Skip to content

Commit b8961a4

Browse files
linear banding strategy
1 parent 527bc53 commit b8961a4

3 files changed

Lines changed: 31 additions & 6 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ In July 2023, we started populating a [Github repository](https://github.com/dan
122122

123123
## Version History
124124

125+
* 2026-01-13 v1.4.6: Introduced linear banding strategy, to be used as default in future releases (will require re-creation of the whole index).
125126
* 2025-12-22 v1.4.5: Fixed a bug due to early conversion when fetching many FunctionEntries at once, which would crash if one function ID does not exist.
126127
* 2025-12-22 v1.4.4: No changes, just moved plugins to their own repo located at [mcrit-plugins](https://github.com/danielplohmann/mcrit-plugins).
127128
* 2025-12-08 v1.4.3: Major improvements to MCRIT IDA plugin UI, backend now supports faster cross matching jobs only matching among selected samples, minor bugfixes.

mcrit/config/StorageConfig.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ class StorageConfig(ConfigInterface):
2525
STORAGE_MONGODB_CLEANUP_TTL: int = 60 * 60 * 24 * 7
2626
# Once MinHashes have been calculated, discard disassembly from function entries
2727
STORAGE_DROP_DISASSEMBLY: bool = False
28+
# supported strategies:
29+
# * random: randomly sample from minhash fields; possibly more fuzziness, but likely won't use all minhash fields
30+
# * linear: use a sequential selection of minhash fields; requires band_size * num_bands == MINHASH_SIGNATURE_LENGTH
31+
STORAGE_BAND_STRATEGY = "random"
2832
# random seed to be used when deriving sequences used as bands
2933
STORAGE_BAND_SEED: int = 0xDEADBEEF
3034
# Banding supports:

mcrit/storage/StorageInterface.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -686,14 +686,34 @@ def createBandhashProjection(self, minhash):
686686
a dict containing signature indices used for bandhashing by band id
687687
"""
688688
band_projection = {}
689-
random.seed(self._storage_config.STORAGE_BAND_SEED)
690-
band_index = 0
691-
for band_size, num_bands in self._storage_config.STORAGE_BANDS.items():
689+
banding_strategy = getattr(self._storage_config, "STORAGE_BAND_STRATEGY", "random")
690+
if banding_strategy == "random":
691+
random.seed(self._storage_config.STORAGE_BAND_SEED)
692+
band_index = 0
693+
for band_size, num_bands in self._storage_config.STORAGE_BANDS.items():
694+
for _ in range(num_bands):
695+
index_sequence = [index for index in range(len(minhash.getMinHashInt()))]
696+
random.shuffle(index_sequence)
697+
band_projection[band_index] = index_sequence[:band_size]
698+
band_index += 1
699+
elif banding_strategy == "linear":
700+
band_index = 0
701+
if len(self._storage_config.STORAGE_BANDS) != 1:
702+
raise AttributeError("When using STORAGE_BAND_STRATEGY: linear, use only a single band_size definition - recommended: 4.")
703+
size_num_tuples = [i for i in self._storage_config.STORAGE_BANDS.items()][0]
704+
band_size = size_num_tuples[0]
705+
num_bands = size_num_tuples[1]
706+
if not band_size * num_bands == self._minhash_config.MINHASH_SIGNATURE_LENGTH:
707+
raise AttributeError("When using STORAGE_BAND_STRATEGY: linear, keep product of band_size (%d) and num_bands (%d) equal to MINHASH_SIGNATURE_LENGTH (%d) - recommended: 4/16/64." % (band_size, num_bands, self._minhash_config.MINHASH_SIGNATURE_LENGTH))
692708
for _ in range(num_bands):
693-
index_sequence = [index for index in range(len(minhash.getMinHashInt()))]
694-
random.shuffle(index_sequence)
695-
band_projection[band_index] = index_sequence[:band_size]
709+
index_sequence = []
710+
step_size = int(self._minhash_config.MINHASH_SIGNATURE_LENGTH / band_size)
711+
for index_num in range(band_size):
712+
index_sequence.append(index_num * step_size + band_index)
713+
band_projection[band_index] = index_sequence
696714
band_index += 1
715+
else:
716+
raise AttributeError("unrecognized STORAGE_BAND_STRATEGY in STORAGE_CONFIG")
697717
return band_projection
698718

699719
# -> Dict[BandIndex, BandHash]

0 commit comments

Comments
 (0)