use core::mem::size_of;

use crate::memmem::{
    prefilter::{PrefilterFnTy, PrefilterState},
    vector::Vector,
    NeedleInfo,
};

/// The implementation of the forward vector accelerated candidate finder.
///
/// This is inspired by the "generic SIMD" algorithm described here:
/// http://0x80.pl/articles/simd-strfind.html#algorithm-1-generic-simd
///
/// The main difference is that this is just a prefilter. That is, it reports
/// candidates once they are seen and doesn't attempt to confirm them. Also,
/// the bytes this routine uses to check for candidates are selected based on
/// an a priori background frequency distribution. This means that on most
/// haystacks, this will on average spend more time in vectorized code than it
/// would if it just used the first and last bytes of the needle.
///
/// Note that a non-prefilter variant of this algorithm can be found in the
/// parent module, but it only works on smaller needles.
///
/// `prestate`, `ninfo`, `haystack` and `needle` are the four prefilter
/// function parameters. `fallback` is a prefilter that is used if the haystack
/// is too small to be handled with the given vector size.
///
/// This routine is not safe because it is intended for callers to specialize
/// this with a particular vector (e.g., __m256i) and then call it with the
/// relevant target feature (e.g., avx2) enabled.
///
/// # Panics
///
/// If `needle.len() <= 1`, then this panics.
///
/// # Safety
///
/// Since this is meant to be used with vector functions, callers need to
/// specialize this inside of a function with a `target_feature` attribute.
/// Therefore, callers must ensure that whatever target feature is being used
/// supports the vector functions that this function is specialized for. (For
/// the specific vector functions used, see the Vector trait implementations.)
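///
/// For example, a caller might specialize this along the lines of the
/// following sketch. The wrapper name and `fallback_prefilter` are
/// hypothetical; the requirement is simply that the chosen vector type
/// implements `Vector` and that the enabled target feature supports it.
///
/// ```ignore
/// #[target_feature(enable = "avx2")]
/// unsafe fn find_avx2(
///     prestate: &mut PrefilterState,
///     ninfo: &NeedleInfo,
///     haystack: &[u8],
///     needle: &[u8],
/// ) -> Option<usize> {
///     // 'fallback_prefilter' stands in for whatever scalar prefilter the
///     // caller uses when the haystack is too small for a vector pass.
///     find::<core::arch::x86_64::__m256i>(
///         prestate, ninfo, haystack, needle, fallback_prefilter,
///     )
/// }
/// ```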
#[inline(always)]
pub(crate) unsafe fn find<V: Vector>(
    prestate: &mut PrefilterState,
    ninfo: &NeedleInfo,
    haystack: &[u8],
    needle: &[u8],
    fallback: PrefilterFnTy,
) -> Option<usize> {
    assert!(needle.len() >= 2, "needle must be at least 2 bytes");
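    // 'rare1i' and 'rare2i' are offsets into the needle of the two bytes
    // believed to be the rarest in typical haystacks (per the background
    // frequency distribution mentioned above), with rare1i <= rare2i.
    // Checking one chunk does unaligned loads of size(V) bytes at both
    // ptr+rare1i and ptr+rare2i, so the haystack must have at least
    // rare2i + size(V) bytes for even a single chunk check to be in bounds.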
    let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_usize();
    let min_haystack_len = rare2i + size_of::<V>();
    if haystack.len() < min_haystack_len {
        return fallback(prestate, ninfo, haystack, needle);
    }

    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let max_ptr = end_ptr.sub(min_haystack_len);
    let mut ptr = start_ptr;

    let rare1chunk = V::splat(needle[rare1i]);
    let rare2chunk = V::splat(needle[rare2i]);

    // N.B. I did experiment with unrolling the loop to deal with 2*size(V)
    // bytes at a time and 4*size(V) bytes at a time. The double unroll
    // was marginally faster while the quadruple unroll was unambiguously
    // slower. In the end, I decided the complexity from unrolling wasn't
    // worth it. I used the memmem/krate/prebuilt/huge-en/ benchmarks to
    // compare.
    while ptr <= max_ptr {
        let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
        if let Some(chunki) = m {
            return Some(matched(prestate, start_ptr, ptr, chunki));
        }
        ptr = ptr.add(size_of::<V>());
    }
    if ptr < end_ptr {
        // This routine immediately quits if a candidate match is found.
        // That means that if we're here, no candidate matches have been
        // found at or before 'ptr'. Thus, we don't need to mask anything
        // out even though we might technically search part of the haystack
        // that we've already searched (because we know it can't match).
        ptr = max_ptr;
        let m = find_in_chunk2(ptr, rare1i, rare2i, rare1chunk, rare2chunk);
        if let Some(chunki) = m {
            return Some(matched(prestate, start_ptr, ptr, chunki));
        }
    }
    prestate.update(haystack.len());
    None
}

// Below are two different techniques for checking whether a candidate
// match exists in a given chunk or not. find_in_chunk2 checks two bytes
// whereas find_in_chunk3 checks three bytes. The idea behind checking
// three bytes is that while we do a bit more work per iteration, we
// decrease the chances of a false positive match being reported and thus
// make the search faster overall. This actually works out for the
// memmem/krate/prebuilt/huge-en/never-all-common-bytes benchmark, where
// using find_in_chunk3 is about 25% faster than find_in_chunk2. However,
// it turns out that find_in_chunk2 is faster for all other benchmarks, so
// perhaps the extra check isn't worth it in practice.
//
// For now, we go with find_in_chunk2, but we leave find_in_chunk3 around
// to make it easy to switch to and benchmark when possible.
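//
// Switching over would look roughly like this inside 'find' (sketch only):
//
//     let firstchunk = V::splat(needle[0]);
//     ...
//     let m = find_in_chunk3(
//         ptr, rare1i, rare2i, firstchunk, rare1chunk, rare2chunk,
//     );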

/// Search for an occurrence of two rare bytes from the needle in the current
/// chunk pointed to by ptr.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
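///
/// On a hit, the returned value is a chunk-relative offset: the first lane
/// at which both rare bytes matched at their respective needle offsets, and
/// therefore a candidate starting position of the needle within this chunk.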
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i).
#[inline(always)]
unsafe fn find_in_chunk2<V: Vector>(
    ptr: *const u8,
    rare1i: usize,
    rare2i: usize,
    rare1chunk: V,
    rare2chunk: V,
) -> Option<usize> {
    let chunk0 = V::load_unaligned(ptr.add(rare1i));
    let chunk1 = V::load_unaligned(ptr.add(rare2i));

    let eq0 = chunk0.cmpeq(rare1chunk);
    let eq1 = chunk1.cmpeq(rare2chunk);

    let match_offsets = eq0.and(eq1).movemask();
    if match_offsets == 0 {
        return None;
    }
    Some(match_offsets.trailing_zeros() as usize)
}

/// Search for an occurrence of two rare bytes and the first byte (even if one
/// of the rare bytes is equivalent to the first byte) from the needle in the
/// current chunk pointed to by ptr.
///
/// firstchunk, rare1chunk and rare2chunk correspond to vectors with the first,
/// rare1 and rare2 bytes repeated in each 8-bit lane, respectively.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at ptr,
/// (ptr + rare1i) and (ptr + rare2i).
#[allow(dead_code)]
#[inline(always)]
unsafe fn find_in_chunk3<V: Vector>(
    ptr: *const u8,
    rare1i: usize,
    rare2i: usize,
    firstchunk: V,
    rare1chunk: V,
    rare2chunk: V,
) -> Option<usize> {
    let chunk0 = V::load_unaligned(ptr);
    let chunk1 = V::load_unaligned(ptr.add(rare1i));
    let chunk2 = V::load_unaligned(ptr.add(rare2i));

    let eq0 = chunk0.cmpeq(firstchunk);
    let eq1 = chunk1.cmpeq(rare1chunk);
    let eq2 = chunk2.cmpeq(rare2chunk);

    let match_offsets = eq0.and(eq1).and(eq2).movemask();
    if match_offsets == 0 {
        return None;
    }
    Some(match_offsets.trailing_zeros() as usize)
}

/// Accepts a chunk-relative offset and returns a haystack-relative offset
/// after updating the prefilter state.
///
/// Why do we use this uninlineable function when a search completes? Well,
/// I don't know. Really. Obviously this function was not here initially.
/// When doing profiling, the codegen for the inner loop here looked bad and
/// I didn't know why. There were a couple extra 'add' instructions and an
/// extra 'lea' instruction that I couldn't explain. I hypothesized that the
/// optimizer was having trouble untangling the hot code in the loop from the
/// code that deals with a candidate match. By putting the latter into an
/// uninlineable function, it kind of forces the issue and it had the intended
/// effect: codegen improved measurably. It's good for a ~10% improvement
/// across the board on the memmem/krate/prebuilt/huge-en/ benchmarks.
#[cold]
#[inline(never)]
fn matched(
    prestate: &mut PrefilterState,
    start_ptr: *const u8,
    ptr: *const u8,
    chunki: usize,
) -> usize {
    let found = diff(ptr, start_ptr) + chunki;
    prestate.update(found);
    found
}

/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn diff(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}