Skip to content

Commit 3a536b1

Browse files
alphabet related code in src/pydna/alphabet.py
1 parent 5e60e26 commit 3a536b1

File tree

1 file changed

+329
-0
lines changed

1 file changed

+329
-0
lines changed

src/pydna/alphabet.py

Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Six multiline strings are defined in this file.
6+
7+
codestrings["un_ambiguous_ds_dna"]
8+
codestrings["ambiguous_ds_dna"]
9+
codestrings["ds_rna"]
10+
codestrings["single_stranded_dna_rna"]
11+
codestrings["mismatched_dna_rna"]
12+
codestrings["loops_dna_rna"]
13+
14+
Each string has five lines and describes the DNA alphabet
15+
used in Pydna in this form:
16+
17+
W 1
18+
| 2
19+
C 3
20+
<empty line> 4
21+
S 5
22+
23+
W (line 1) and C (line 3) are complementary bases in a double stranded DNA molecule and S (line 5) are
24+
the symbols of the alphabet used to describe the base pair above the symbol.
25+
26+
"""
27+
28+
# A single space marks an empty (unpaired) position in a Watson or Crick line.
emptyspace = chr(32)

# Maps a name to a five-line code string: Watson line, a line of pipes,
# Crick line, an empty line and finally the line of symbols.
codestrings = dict()

codestrings["un_ambiguous_ds_dna"] = """\
GATC
||||
CTAG

GATC
"""

codestrings["ambiguous_ds_dna"] = """\
RYMKSWHBVDN
|||||||||||
YRKMSWDVBHN

RYMKSWHBVDN
"""

codestrings["ds_rna"] = """\
UA
||
AU

UO
"""

# The literal is written with dots where a strand has no base; every dot is
# replaced by `emptyspace` before the string is stored.
codestrings["single_stranded_dna_rna"] = """\
GATC....U.
||||||||||
....CTAG.U

PEXIQFZJ$%
""".replace(".", emptyspace)

codestrings["mismatched_dna_rna"] = """\
AAACCCGGGTTTUUUGCT
||||||||||||||||||
ACGACTAGTCGTGCTUUU

!#{}&*()<>@:?[]=_;
"""

codestrings["loops_dna_rna"] = """\
-----AGCTU
||||||||||
AGCTU-----

0123456789
"""

# The complete set of names expected in `codestrings`.
keys = {
    "un_ambiguous_ds_dna",
    "ambiguous_ds_dna",
    "ds_rna",
    "single_stranded_dna_rna",
    "mismatched_dna_rna",
    "loops_dna_rna",
}

assert set(codestrings) == keys

# Characters that must never be used as symbols.
not_dscode = "lL\"',-./\\^`|+~"

# Check every code string for consistency of format.
for name, codestring in codestrings.items():

    lines = codestring.splitlines()

    assert len(lines) == 5, name

    # Watson, Crick and Symbol lines carry the information; the second line
    # has to be pipes ("|") and the fourth has to be empty.
    watsn, pipes, crick, empty, symbl = lines

    # The fourth line is only a separator and must not hold any characters.
    assert empty == "", name

    assert all(ln.isascii() for ln in (watsn, crick, symbl)), name

    # No lower case letters anywhere, including lines that also contain
    # spaces, digits or punctuation.
    assert all(ln == ln.upper() for ln in (watsn, crick, symbl)), name

    # The pipes line contains nothing but "|".
    assert set(pipes) == {"|"}, name

    # All four informative lines have the same length.
    assert all(len(ln) == len(watsn) for ln in (pipes, crick, symbl)), name

    # None of the reserved characters is used as a symbol.
    assert not any(letter in not_dscode for letter in symbl), name
134+
135+
136+
# codes[name] maps a (watson, crick) pair of letters to its symbol.
# Each column of a code string is entered in upper and lower case.
codes = dict()

for name, codestring in codestrings.items():

    # Only the Watson, Crick and Symbol lines are needed here.
    watsons, _, cricks, _, symbols = lines = codestring.splitlines()

    dct = codes[name] = dict()

    for watson, crick, symbol in zip(watsons, cricks, symbols):
        if watson != emptyspace:
            # The case of the symbol follows the case of the Watson letter.
            dct.update(
                {
                    (watson.upper(), crick.upper()): symbol.upper(),
                    (watson.upper(), crick.lower()): symbol.upper(),
                    (watson.lower(), crick.upper()): symbol.lower(),
                    (watson.lower(), crick.lower()): symbol.lower(),
                }
            )
        else:
            # No Watson letter: the case of the symbol follows the Crick letter.
            dct.update(
                {
                    (watson, crick.lower()): symbol.lower(),
                    (watson, crick.upper()): symbol.upper(),
                }
            )
155+
156+
157+
# (watson, crick) -> symbol for the base pairs in use, as str.
# codes["mismatched_dna_rna"] and codes["loops_dna_rna"] are not included.
bp_dict_str = {
    **codes["un_ambiguous_ds_dna"],
    **codes["ambiguous_ds_dna"],
    **codes["ds_rna"],
    **codes["single_stranded_dna_rna"],
}

# The same mapping with every letter and symbol encoded to ASCII bytes.
bp_dict = {
    tuple(letter.encode("ascii") for letter in pair): symbol.encode("ascii")
    for pair, symbol in bp_dict_str.items()
}

# Fully paired unambiguous DNA and RNA base pairs only.
temp = {**codes["un_ambiguous_ds_dna"], **codes["ds_rna"]}
172+
173+
174+
# annealing_dict_str tells whether two single stranded symbols can anneal
# and which symbol results. It has the form (x, y): s, where x and y are
# symbols taken from the two fragments and s is the symbol of the base pair
# that is formed. One entry is ("P", "Q"): "G", which is the first of the
# four new base pairs formed between a and b in the example below.
#
#   (a)
#   gggPEXI       (dscode for a)
#
#   gggGATC
#   ccc
#             aaa   (b)
#         CTAGttt
#
#         QFZJaaa   (dscode for b)
#
#   gggGATCaaa    (annealing product between a and b)
#   cccCTAGttt
#
annealing_dict_str = dict()

d = codes["single_stranded_dna_rna"]  # Short alias, removed again below.

# Every key of d has an empty upper or lower position; (x, y), symbol is
# ("G", " "), "P" in the first iteration.
for (x, y), symbol in d.items():
    if emptyspace not in (x, y):
        raise ValueError("This should not happen")
    if y == emptyspace:
        # x sits on the upper strand: find a partner for it in temp and the
        # symbol that this partner has when it stands alone.
        other = next(b for a, b in temp if a == x)
        symbol_other = d[emptyspace, other]
        annealing_dict_str.update(
            dict.fromkeys(
                ((symbol, symbol_other), (symbol_other, symbol)),
                temp[x, other],
            )
        )
    else:
        # y sits on the lower strand: same lookup from the other side.
        other = next(a for a, b in temp if b == y)
        symbol_other = d[other, emptyspace]
        annealing_dict_str.update(
            dict.fromkeys(
                ((symbol, symbol_other), (symbol_other, symbol)),
                temp[other, y],
            )
        )

del d
219+
220+
# Collects upper/lower case combinations of the keys in annealing_dict_str.
mixed_case_dict = dict()

for (x, y), symbol in annealing_dict_str.items():
    # The case of the resulting symbol follows the case of x.
    mixed_case_dict.update(
        {
            (x.upper(), y.lower()): symbol.upper(),
            (x.lower(), y.upper()): symbol.lower(),
            (x.lower(), y.lower()): symbol.lower(),
        }
    )

# Add the mixed case entries; on a shared key the mixed case entry wins.
annealing_dict_str = {**annealing_dict_str, **mixed_case_dict}

# A bytes version of annealing_dict_str.
annealing_dict = {
    tuple(letter.encode("ascii") for letter in pair): product.encode("ascii")
    for pair, product in annealing_dict_str.items()
}
238+
239+
# Parallel lists, one entry per base pair of bp_dict whose two letters do
# not differ in case: the symbol, the symbol of the swapped pair, the
# Watson letter and the Crick letter.
dscode_sense, dscode_compl, watson, crick = [], [], [], []

for (w, c), s in bp_dict.items():

    # Keep the pair unless one letter is upper case and the other lower case.
    if not (w.isupper() and c.islower()) and not (w.islower() and c.isupper()):
        dscode_sense.append(s)
        dscode_compl.append(bp_dict[c, w])
        watson.append(w)
        crick.append(c)

# Translates each symbol to the symbol of the pair with the strands swapped.
complement_table_dscode = bytes.maketrans(
    b"".join(dscode_sense),
    b"".join(dscode_compl),
)

placeholder1 = b"~"
placeholder2 = b"+"
interval = b"."
empty_bs = emptyspace.encode("ascii")

# The extra bytes have to come from the characters never used as symbols.
for bstring in (placeholder1, placeholder2, interval):
    assert all(letter in not_dscode.encode("ascii") for letter in bstring)

# NOTE(review): a symbol listed more than once in dscode_sense is translated
# according to its last occurrence (bytes.maketrans lets later pairs win).
# This looks relevant for symbols without case such as "$" and "%" - confirm.

# Symbol -> Watson letter; placeholder1 -> empty, placeholder2 -> interval.
dscode_to_watson_table = bytes.maketrans(
    b"".join([*dscode_sense, placeholder1, placeholder2]),
    b"".join([*watson, empty_bs, interval]),
)

# Symbol -> Crick letter; placeholder1 -> interval, placeholder2 -> empty.
dscode_to_crick_table = bytes.maketrans(
    b"".join([*dscode_sense, placeholder1, placeholder2]),
    b"".join([*crick, interval, empty_bs]),
)
277+
278+
279+
# Watson letter -> symbol, for the single stranded entries in which the
# Crick position is empty.
watson_tail_letter_dict = {
    (w.encode("ascii")): s.encode("ascii")
    for (w, c), s in codes["single_stranded_dna_rna"].items()
    if c.isspace()
}

from_letters = b"".join(watson_tail_letter_dict.keys())

to_letters = b"".join(watson_tail_letter_dict.values())

# Translates a Watson letter to the symbol it has when the Crick position is
# empty, e.g. b"GATCgatc" -> b"PEXIpexi".
dscode_to_crick_tail_table = bytes.maketrans(from_letters, to_letters)


# Crick letter -> symbol, for the single stranded entries in which the
# Watson position is empty.
crick_tail_letter_dict = {
    (c.encode("ascii")): s.encode("ascii")
    for (w, c), s in codes["single_stranded_dna_rna"].items()
    if w.isspace()
}

from_letters = b"".join(crick_tail_letter_dict.keys())

to_letters = b"".join(crick_tail_letter_dict.values())

# This table is spelled out literally. A table built from from_letters and
# to_letters used to be assigned to this name first, but it was overwritten
# by this literal on the very next statement and was never observable, so
# that dead assignment has been removed.
# NOTE(review): the literal maps G -> Q, whereas crick_tail_letter_dict is
# keyed by the Crick letter (C -> Q); confirm that the literal is the
# intended table.
dscode_to_watson_tail_table = bytes.maketrans(b"GATCgatc", b"QFZJqfzj")
305+
306+
307+
# Translates the single stranded symbols P, E, X, I and Q, F, Z, J (both
# cases) to the letters G, A, T, C of the same case.
dscode_to_to_full_sequence_table = bytes.maketrans(
    b"PEXIpexi" b"QFZJqfzj",
    b"GATCgatc" b"GATCgatc",
)
310+
311+
312+
# IUPAC ambiguity code complements.
# Each key is an upper case nucleotide code. Each value is a non-capturing
# regex group with one single-letter alternative per letter accepted as the
# complement of that code.
iupac_compl_regex = {
    "A": "(?:T|U)",
    "C": "(?:G)",
    "G": "(?:C)",
    "T": "(?:A)",
    "U": "(?:A)",
    "R": "(?:T|C|Y)",
    "Y": "(?:G|A|R)",
    "S": "(?:G|C|S)",
    "W": "(?:A|T|W)",
    # K is G or T, so its complement is C, A or the ambiguity code M.
    # Each alternative has to be a single letter ("AM" would only match the
    # two-letter string "AM").
    "K": "(?:C|A|M)",
    "M": "(?:T|G|K)",
    "B": "(?:C|G|A|V)",
    "D": "(?:A|C|T|H)",
    "H": "(?:A|G|T|D)",
    "V": "(?:T|C|G|B)",
    "N": "(?:A|G|C|T|N)",
}

0 commit comments

Comments
 (0)