-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy path29-sequence-mining.py
More file actions
32 lines (21 loc) · 820 Bytes
/
29-sequence-mining.py
File metadata and controls
32 lines (21 loc) · 820 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import itertools, collections
def create_substrings(data, max_int):
    """Yield every contiguous slice of length 1..max_int from each sequence.

    Args:
        data: Iterable of sliceable sequences that support ``len()``.
        max_int: Largest slice length to produce (inclusive).

    Yields:
        ``(seq_index, slice)`` pairs, where ``seq_index`` is the position of
        the source sequence in ``data``. For each sequence, slices are
        produced by increasing length and, within one length, left to right.
    """
    for seq_index, seq in enumerate(data):
        for width in range(1, max_int + 1):
            # Number of valid start offsets for this width; when the width
            # exceeds the sequence length this is <= 0 and range() is empty.
            start_count = len(seq) - width + 1
            for start in range(start_count):
                yield seq_index, seq[start:start + width]
def seq_mining(data, prop, max_int):
    """Count contiguous sub-sequences that occur in enough input sequences.

    Args:
        data: Sized collection of sliceable sequences whose slices are
            hashable (they are used as dictionary keys), e.g. strings or
            tuples.
        prop: Minimum fraction of the sequences in ``data`` that must
            contain a sub-sequence for it to be reported.
        max_int: Maximum sub-sequence length to consider; capped at the
            length of the longest sequence in ``data``.

    Returns:
        A ``collections.Counter`` mapping each qualifying sub-sequence to
        the number of distinct sequences in which it occurs. The counter is
        empty when ``data`` is empty.
    """
    data_len = len(data)
    if data_len == 0:
        # Nothing to mine. Returning early also avoids max() raising
        # ValueError on an empty input and a division by zero below.
        return collections.Counter()

    max_length = max(len(sequence) for sequence in data)
    max_int = min(max_int, max_length)

    # sub-sequence -> set of indices of the sequences that contain it; using
    # a set means repeated occurrences within one sequence count only once.
    positions = collections.defaultdict(set)
    for seq_index, sub_seq in create_substrings(data, max_int):
        positions[sub_seq].add(seq_index)

    counter = collections.Counter()
    for sub_seq, seq_indices in positions.items():
        support = len(seq_indices)
        if support / data_len < prop:
            continue
        counter[sub_seq] = support
    return counter