#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Six multiline strings are defined in this file.

codestrings["un_ambiguous_ds_dna"]
codestrings["ambiguous_ds_dna"]
codestrings["ds_rna"]
codestrings["single_stranded_dna_rna"]
codestrings["mismatched_dna_rna"]
codestrings["loops_dna_rna"]

Each string has five lines and describes the DNA alphabet
used in Pydna in this form:

W             1
|             2
C             3
<empty line>  4
S             5

W (line 1) and C (line 3) are complementary bases in a double stranded DNA
molecule and S (line 5) are the symbols of the alphabet used to describe the
base pair above the symbol.

From these strings the module derives, at import time:

- ``codes``: one ``{(watson, crick): symbol}`` dict per code string.
- ``bp_dict_str`` / ``bp_dict``: the merged str / bytes base pair dicts.
- ``annealing_dict_str`` / ``annealing_dict``: which two single stranded
  symbols can pair, and the symbol of the resulting base pair.
- several ``bytes.maketrans`` tables for translating dscode.
"""

emptyspace = chr(32)

codestrings = {}


codestrings["un_ambiguous_ds_dna"] = """\
GATC
||||
CTAG

GATC
"""

codestrings["ambiguous_ds_dna"] = """\
RYMKSWHBVDN
|||||||||||
YRKMSWDVBHN

RYMKSWHBVDN
"""

codestrings["ds_rna"] = """\
UA
||
AU

UO
"""

# Dots stand in for empty positions so that the literal carries no
# significant (and easily lost) whitespace; they are replaced by spaces.
codestrings["single_stranded_dna_rna"] = """\
GATC....U.
||||||||||
....CTAG.U

PEXIQFZJ$%
""".replace(
    ".", emptyspace
)

codestrings["mismatched_dna_rna"] = """\
AAACCCGGGTTTUUUGCT
||||||||||||||||||
ACGACTAGTCGTGCTUUU

!#{}&*()<>@:?[]=_;
"""

codestrings["loops_dna_rna"] = """\
-----AGCTU
||||||||||
AGCTU-----

0123456789
"""

keys = {
    "un_ambiguous_ds_dna",
    "ambiguous_ds_dna",
    "ds_rna",
    "single_stranded_dna_rna",
    "mismatched_dna_rna",
    "loops_dna_rna",
}

assert set(codestrings.keys()) == keys

# Characters that must never be used as dscode symbols. Some of them are
# used further down as placeholders in the translation tables.
not_dscode = "lL\"',-./\\^`|+~"

for name, codestring in codestrings.items():

    # This loops all codestrings and checks for consistency of format.
    lines = codestring.splitlines()

    assert len(lines) == 5

    # We want the Watson, Crick and Symbol lines only
    # Second line has to be pipes ("|") and fourth has to be empty

    watsn, pipes, crick, empty, symbl = lines

    assert all(ln.isascii() for ln in (watsn, crick, symbl))

    # Purely alphabetic lines have to be upper case.
    assert all(ln.isupper() for ln in (watsn, crick, symbl) if ln.isalpha())

    # check so that pipes contain only "|"
    assert set(pipes) == set("|")

    # check so that the separator line really is empty
    assert empty == ""

    # check so strings are the same length
    assert all(len(ln) == len(watsn) for ln in (watsn, pipes, crick, symbl))

    # These characters are not used.
    assert not any(letter in not_dscode for letter in symbl)


# codes[name] maps (watson, crick) -> symbol for each code string. The case
# of the symbol follows the case of the Watson base, or the case of the
# Crick base where the Watson position is empty.
codes = {}

for name, codestring in codestrings.items():

    lines = codestring.splitlines()

    watsons, _, cricks, _, symbols = lines

    codes[name] = dct = {}

    for watson, crick, symbol in zip(watsons, cricks, symbols):
        if watson == emptyspace:
            dct[watson, crick.lower()] = symbol.lower()
            dct[watson, crick.upper()] = symbol.upper()
        else:
            dct[watson.upper(), crick.upper()] = symbol.upper()
            dct[watson.upper(), crick.lower()] = symbol.upper()
            dct[watson.lower(), crick.upper()] = symbol.lower()
            dct[watson.lower(), crick.lower()] = symbol.lower()


# codes["mismatched_dna_rna"] and codes["loops_dna_rna"] are not merged in.
bp_dict_str = (
    codes["un_ambiguous_ds_dna"]
    | codes["ambiguous_ds_dna"]
    | codes["ds_rna"]
    | codes["single_stranded_dna_rna"]
)

# A bytestr version of bp_dict_str
bp_dict = {
    (w.encode("ascii"), c.encode("ascii")): s.encode("ascii")
    for (w, c), s in bp_dict_str.items()
}

# Unambiguous DNA and RNA base pairs, used to look up pairing partners below.
temp = codes["un_ambiguous_ds_dna"] | codes["ds_rna"]


annealing_dict_str = {}

# The annealing_dict_str is constructed below. This dict contains the
# information needed to tell if two DNA fragments (like a and b below) can
# anneal. The dict has the form (x, y): s where x and y are dscode symbols in
# a and b and the symbol s is the resulting symbol for the base pair that is
# formed. One element in the dict is ('P', 'Q'): 'G' which matches the first
# of the four new base pairings formed between a and b in the example below.
#
#
#    (a)
#    gggPEXI        (dscode for a)
#
#    gggGATC
#    ccc
#           aaa     (b)
#       CTAGttt
#
#       QFZJaaa     (dscode for b)
#
#
#    gggGATCaaa     (annealing product between a and b)
#    cccCTAGttt
#
# This loops through the base pairs where the upper or lower
# positions are empty. (x, y), symbol would be ("G", " "), "P"
# in the first iteration.
#

d = codes["single_stranded_dna_rna"]  # Alias to make the code below more readable.

for (x, y), symbol in d.items():
    if y == emptyspace:
        # Watson-only position: find the first Crick base that pairs with x.
        other = next(b for a, b in temp if a == x)
        symbol_other = d[emptyspace, other]
        annealing_dict_str[symbol, symbol_other] = temp[x, other]
        annealing_dict_str[symbol_other, symbol] = temp[x, other]
    elif x == emptyspace:
        # Crick-only position: find the first Watson base that pairs with y.
        other = next(a for a, b in temp if b == y)
        symbol_other = d[other, emptyspace]
        annealing_dict_str[symbol, symbol_other] = temp[other, y]
        annealing_dict_str[symbol_other, symbol] = temp[other, y]
    else:
        raise ValueError("This should not happen")

del d

# This dict will contain the mixed case variants of the entries in
# annealing_dict_str. The case of the result follows the case of x.
mixed_case_dict = {}

for (x, y), symbol in annealing_dict_str.items():
    mixed_case_dict[x.upper(), y.lower()] = symbol.upper()
    mixed_case_dict[x.lower(), y.upper()] = symbol.lower()
    mixed_case_dict[x.lower(), y.lower()] = symbol.lower()

# Add mixed case entries to the dict
annealing_dict_str = annealing_dict_str | mixed_case_dict

# A bytestr version of the annealing_dict_str
annealing_dict = {
    (x.encode("ascii"), y.encode("ascii")): s.encode("ascii")
    for (x, y), s in annealing_dict_str.items()
}

# Parallel lists, one entry per base pair in bp_dict (mixed case pairs are
# skipped): the symbol, the symbol of the swapped pair, and the two bases.
dscode_sense = []
dscode_compl = []
watson = []
crick = []

for (w, c), s in bp_dict.items():

    if w.isupper() and c.islower() or w.islower() and c.isupper():
        continue

    dscode_sense.append(s)
    dscode_compl.append(bp_dict[c, w])
    watson.append(w)
    crick.append(c)

# Translates each dscode symbol to the symbol of the swapped (crick, watson) pair.
complement_table_dscode = bytes.maketrans(
    b"".join(dscode_sense), b"".join(dscode_compl)
)

placeholder1, placeholder2, interval, empty_bs = (
    b"~",
    b"+",
    b".",
    emptyspace.encode("ascii"),
)

# Placeholders must not collide with any dscode symbol.
for bstring in placeholder1, placeholder2, interval:
    assert all(letter in not_dscode.encode("ascii") for letter in bstring)

# Translate dscode to the Watson strand; the two placeholders become an
# empty position and an interval mark, respectively.
dscode_to_watson_table = bytes.maketrans(
    b"".join(dscode_sense) + placeholder1 + placeholder2,
    b"".join(watson) + empty_bs + interval,
)

# Translate dscode to the Crick strand; the placeholder roles are swapped
# relative to dscode_to_watson_table.
dscode_to_crick_table = bytes.maketrans(
    b"".join(dscode_sense) + placeholder1 + placeholder2,
    b"".join(crick) + interval + empty_bs,
)


# Watson base -> symbol for that base with an empty Crick position.
watson_tail_letter_dict = {
    (w.encode("ascii")): s.encode("ascii")
    for (w, c), s in codes["single_stranded_dna_rna"].items()
    if c.isspace()
}

from_letters = b"".join(watson_tail_letter_dict.keys())

to_letters = b"".join(watson_tail_letter_dict.values())

# Maps G, A, T, C (and lower case) to P, E, X, I, and U/u to "$".
dscode_to_crick_tail_table = bytes.maketrans(from_letters, to_letters)


# Crick base -> symbol for that base with an empty Watson position.
crick_tail_letter_dict = {
    (c.encode("ascii")): s.encode("ascii")
    for (w, c), s in codes["single_stranded_dna_rna"].items()
    if w.isspace()
}

from_letters = b"".join(crick_tail_letter_dict.keys())

to_letters = b"".join(crick_tail_letter_dict.values())

# This table is spelled out instead of being built from from_letters and
# to_letters: crick_tail_letter_dict is keyed by the Crick base (C -> Q),
# whereas this table is keyed by the symbol of the base pair (G, i.e. G/C,
# -> Q, a lone C on the Crick strand).
# NOTE(review): unlike dscode_to_crick_tail_table, no RNA symbols are
# translated here - confirm that this is intended.
dscode_to_watson_tail_table = bytes.maketrans(b"GATCgatc", b"QFZJqfzj")


# Translate the single stranded DNA symbols back to the base they stand for.
dscode_to_to_full_sequence_table = bytes.maketrans(
    b"PEXIpexiQFZJqfzj", b"GATCgatcGATCgatc"
)


# For each IUPAC code: a regex fragment matching every letter that can sit
# opposite it, including the complementary ambiguity code itself.
iupac_compl_regex = {  # IUPAC Ambiguity Code complements
    "A": "(?:T|U)",
    "C": "(?:G)",
    "G": "(?:C)",
    "T": "(?:A)",
    "U": "(?:A)",
    "R": "(?:T|C|Y)",
    "Y": "(?:G|A|R)",
    "S": "(?:G|C|S)",
    "W": "(?:A|T|W)",
    "K": "(?:C|A|M)",
    "M": "(?:T|G|K)",
    "B": "(?:C|G|A|V)",
    "D": "(?:A|C|T|H)",
    "H": "(?:A|G|T|D)",
    "V": "(?:T|C|G|B)",
    "N": "(?:A|G|C|T|N)",
}