Showing
4 changed files
with
361 additions
and
213 deletions
1 | -v 1.1 beta, January 2021 | 1 | +v 1.3 beta, January 2021 |
2 | 2 | ||
3 | The first uses of RNAnet by people from outside the development team happened this December. | 3 | The first uses of RNAnet by people from outside the development team happened this December. |
4 | Their feedback allowed us to identify issues and useful information to add. | 4 | Their feedback allowed us to identify issues and useful information to add. |
5 | 5 | ||
6 | FEATURE CHANGES | 6 | FEATURE CHANGES |
7 | - - Sequence alignments of the 3D structures mapped to a family are now provided. | 7 | + - Sequence alignments of the 3D structures mapped to a family are now provided. |
8 | - Full alignments with Rfam sequences are not provided, but you can ask us for the files. | 8 | - Full alignments with Rfam sequences are not provided, but you can ask us for the files. |
9 | - Two new fields in table 'family': ali_length and ali_filtered_length. | 9 | - Two new fields in table 'family': ali_length and ali_filtered_length. |
10 | They are the MSA lengths of the alignment with and without the Rfam sequences. | 10 | They are the MSA lengths of the alignment with and without the Rfam sequences. |
11 | + - Gap replacement by consensus (--fill-gaps) has been removed. Now, the gap percentage and consensus are saved | ||
12 | + in the align_column table and the datapoints in CSV format, in separate columns. | ||
13 | + Consensus is one of ACGUN-, the gap being chosen if >75% of the sequences are gaps at this position. | ||
14 | + Otherwise, A/C/G/U is chosen if >50% of the non-gap positions are A/C/G/U. Otherwise, N is the consensus. | ||
11 | 15 | ||
12 | TECHNICAL CHANGES | 16 | TECHNICAL CHANGES |
13 | - - SQLite connexions are now all in WAL mode by default (previously, only the writers used WAL mode) | 17 | + - SQLite connections are now all in WAL mode by default (previously, only the writers used WAL mode, which brought no benefit) |
18 | + - Moved to Python3.9 for internal testing. | ||
19 | + - Latest version of BioPython is now supported (1.78) | ||
14 | 20 | ||
15 | BUG CORRECTIONS | 21 | BUG CORRECTIONS |
16 | - When an alignment file is updated in a newer run of RNANet, all the re_mappings are now re-computed | 22 | - When an alignment file is updated in a newer run of RNANet, all the re_mappings are now re-computed |
... | @@ -19,8 +25,8 @@ BUG CORRECTIONS | ... | @@ -19,8 +25,8 @@ BUG CORRECTIONS |
19 | - Changed the ownership and permissions of files produced by the Docker container. | 25 | - Changed the ownership and permissions of files produced by the Docker container. |
20 | They were previously owned by root and the user could not get access to them. | 26 | They were previously owned by root and the user could not get access to them. |
21 | - Modified nucleotides were not always correctly transformed to N in the alignments (and nucleotide.nt_align_code fields). | 27 | - Modified nucleotides were not always correctly transformed to N in the alignments (and nucleotide.nt_align_code fields). |
22 | - Now, the alignments and nt_align_code only contain "ACGUN-" chars. | 28 | + Now, the alignments and nt_align_code (and consensus) only contain "ACGUN-" chars. |
23 | - Now, 'N' means 'other', while '-' means 'nothing'. | 29 | + Now, 'N' means 'other', while '-' means 'nothing' or 'unknown'. |
24 | 30 | ||
25 | COMING SOON | 31 | COMING SOON |
26 | - Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ . | 32 | - Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ . | ... | ... |
... | @@ -14,7 +14,7 @@ RUN apk update && apk add --no-cache \ | ... | @@ -14,7 +14,7 @@ RUN apk update && apk add --no-cache \ |
14 | py3-matplotlib py3-requests py3-scipy py3-setproctitle py3-sqlalchemy py3-tqdm \ | 14 | py3-matplotlib py3-requests py3-scipy py3-setproctitle py3-sqlalchemy py3-tqdm \ |
15 | sqlite \ | 15 | sqlite \ |
16 | \ | 16 | \ |
17 | - && python3 -m pip install biopython==1.76 pandas psutil pymysql && \ | 17 | + && python3 -m pip install biopython pandas psutil pymysql && \ |
18 | \ | 18 | \ |
19 | wget -q -O /etc/apk/keys/sgerrand.rsa.pub https://alpine-pkgs.sgerrand.com/sgerrand.rsa.pub && \ | 19 | wget -q -O /etc/apk/keys/sgerrand.rsa.pub https://alpine-pkgs.sgerrand.com/sgerrand.rsa.pub && \ |
20 | wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.32-r0/glibc-2.32-r0.apk && \ | 20 | wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.32-r0/glibc-2.32-r0.apk && \ | ... | ... |
... | @@ -125,36 +125,6 @@ class SelectivePortionSelector(object): | ... | @@ -125,36 +125,6 @@ class SelectivePortionSelector(object): |
125 | return 1 | 125 | return 1 |
126 | 126 | ||
127 | 127 | ||
128 | -class BufferingSummaryInfo(AlignInfo.SummaryInfo): | ||
129 | - | ||
130 | - def get_pssm(self, family, index): | ||
131 | - """Create a position specific score matrix object for the alignment. | ||
132 | - | ||
133 | - This creates a position specific score matrix (pssm) which is an | ||
134 | - alternative method to look at a consensus sequence. | ||
135 | - | ||
136 | - Returns: | ||
137 | - - A PSSM (position specific score matrix) object. | ||
138 | - """ | ||
139 | - | ||
140 | - pssm_info = [] | ||
141 | - # now start looping through all of the sequences and getting info | ||
142 | - for residue_num in tqdm(range(self.alignment.get_alignment_length()), position=index+1, desc=f"Worker {index+1}: Count bases in fam {family}", leave=False): | ||
143 | - score_dict = self._get_base_letters("ACGUN") | ||
144 | - for record in self.alignment: | ||
145 | - this_residue = record.seq[residue_num].upper() | ||
146 | - if this_residue not in "-.": | ||
147 | - try: | ||
148 | - score_dict[this_residue] += 1.0 | ||
149 | - except KeyError: | ||
150 | - # if this_residue in "acgun": | ||
151 | - # warn(f"Found {this_residue} in {family} alignment...") | ||
152 | - score_dict[this_residue] = 1.0 | ||
153 | - pssm_info.append(('*', score_dict)) | ||
154 | - | ||
155 | - return AlignInfo.PSSM(pssm_info) | ||
156 | - | ||
157 | - | ||
158 | class Chain: | 128 | class Chain: |
159 | """ | 129 | """ |
160 | The object which stores all our data and the methods to process it. | 130 | The object which stores all our data and the methods to process it. |
... | @@ -963,7 +933,6 @@ class Pipeline: | ... | @@ -963,7 +933,6 @@ class Pipeline: |
963 | # Default options: | 933 | # Default options: |
964 | self.CRYSTAL_RES = 4.0 | 934 | self.CRYSTAL_RES = 4.0 |
965 | self.KEEP_HETATM = False | 935 | self.KEEP_HETATM = False |
966 | - self.FILL_GAPS = True | ||
967 | self.HOMOLOGY = True | 936 | self.HOMOLOGY = True |
968 | self.USE_KNOWN_ISSUES = True | 937 | self.USE_KNOWN_ISSUES = True |
969 | self.RUN_STATS = False | 938 | self.RUN_STATS = False |
... | @@ -984,11 +953,9 @@ class Pipeline: | ... | @@ -984,11 +953,9 @@ class Pipeline: |
984 | setproctitle("RNANet.py process_options()") | 953 | setproctitle("RNANet.py process_options()") |
985 | 954 | ||
986 | try: | 955 | try: |
987 | - opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", | 956 | + opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", |
988 | - ["help", "resolution=", "keep-hetatm=", "from-scratch", "full-inference," | 957 | + "from-scratch", "full-inference", "no-homology", "ignore-issues", "extract", |
989 | - "fill-gaps=", "3d-folder=", "seq-folder=", | 958 | + "all", "no-logs", "archive", "update-homologous"]) |
990 | - "no-homology", "ignore-issues", "extract", "only=", "all", "no-logs", | ||
991 | - "archive", "update-homologous"]) | ||
992 | except getopt.GetoptError as err: | 959 | except getopt.GetoptError as err: |
993 | print(err) | 960 | print(err) |
994 | sys.exit(2) | 961 | sys.exit(2) |
... | @@ -1014,8 +981,6 @@ class Pipeline: | ... | @@ -1014,8 +981,6 @@ class Pipeline: |
1014 | print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") | 981 | print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") |
1015 | print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " | 982 | print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " |
1016 | "\n\t\t\t\tDoes not affect the descriptors.") | 983 | "\n\t\t\t\tDoes not affect the descriptors.") |
1017 | - print("--fill-gaps=True\t\t(True | False) Replace gaps in nt_align_code field due to unresolved residues" | ||
1018 | - "\n\t\t\t\tby the most common nucleotide at this position in the alignment.") | ||
1019 | print("--3d-folder=…\t\t\tPath to a folder to store the 3D data files. Subfolders will contain:" | 984 | print("--3d-folder=…\t\t\tPath to a folder to store the 3D data files. Subfolders will contain:" |
1020 | "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format" | 985 | "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format" |
1021 | "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains" | 986 | "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains" |
... | @@ -1038,7 +1003,7 @@ class Pipeline: | ... | @@ -1038,7 +1003,7 @@ class Pipeline: |
1038 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &") | 1003 | print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &") |
1039 | sys.exit() | 1004 | sys.exit() |
1040 | elif opt == '--version': | 1005 | elif opt == '--version': |
1041 | - print("RNANet 1.2, parallelized, Dockerized") | 1006 | + print("RNANet 1.3 beta, parallelized, Dockerized") |
1042 | sys.exit() | 1007 | sys.exit() |
1043 | elif opt == "-r" or opt == "--resolution": | 1008 | elif opt == "-r" or opt == "--resolution": |
1044 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 1009 | assert float(arg) > 0.0 and float(arg) <= 20.0 |
... | @@ -1048,9 +1013,6 @@ class Pipeline: | ... | @@ -1048,9 +1013,6 @@ class Pipeline: |
1048 | elif opt == "--keep-hetatm": | 1013 | elif opt == "--keep-hetatm": |
1049 | assert arg in ["True", "False"] | 1014 | assert arg in ["True", "False"] |
1050 | self.KEEP_HETATM = (arg == "True") | 1015 | self.KEEP_HETATM = (arg == "True") |
1051 | - elif opt == "--fill-gaps": | ||
1052 | - assert arg in ["True", "False"] | ||
1053 | - self.FILL_GAPS = (arg == "True") | ||
1054 | elif opt == "--no-homology": | 1016 | elif opt == "--no-homology": |
1055 | self.HOMOLOGY = False | 1017 | self.HOMOLOGY = False |
1056 | elif opt == '--3d-folder': | 1018 | elif opt == '--3d-folder': |
... | @@ -1410,13 +1372,12 @@ class Pipeline: | ... | @@ -1410,13 +1372,12 @@ class Pipeline: |
1410 | 1372 | ||
1411 | # Start a process pool to dispatch the RNA families, | 1373 | # Start a process pool to dispatch the RNA families, |
1412 | # over multiple CPUs (one family by CPU) | 1374 | # over multiple CPUs (one family by CPU) |
1413 | - # p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1) | ||
1414 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) | 1375 | p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) |
1415 | 1376 | ||
1416 | try: | 1377 | try: |
1417 | fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) | 1378 | fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) |
1418 | # Apply work_pssm_remap to each RNA family | 1379 | # Apply work_pssm_remap to each RNA family |
1419 | - for i, _ in enumerate(p.imap_unordered(partial(work_pssm_remap, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): | 1380 | + for i, _ in enumerate(p.imap_unordered(work_pssm_remap, self.fam_list, chunksize=1)): |
1420 | # Everytime the iteration finishes on a family, update the global progress bar over the RNA families | 1381 | # Everytime the iteration finishes on a family, update the global progress bar over the RNA families |
1421 | fam_pbar.update(1) | 1382 | fam_pbar.update(1) |
1422 | fam_pbar.close() | 1383 | fam_pbar.close() |
... | @@ -1492,6 +1453,12 @@ class Pipeline: | ... | @@ -1492,6 +1453,12 @@ class Pipeline: |
1492 | subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."]) | 1453 | subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."]) |
1493 | subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"]) | 1454 | subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"]) |
1494 | 1455 | ||
1456 | + # gather the alignments | ||
1457 | + os.makedirs(path_to_seq_data + "realigned/3D_only", exist_ok=True) | ||
1458 | + subprocess.run(["cp", path_to_seq_data + "realigned/*_3d_only.afa", path_to_seq_data + "realigned/3d_only" ]) | ||
1459 | + subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"]) | ||
1460 | + subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."]) | ||
1461 | + | ||
1495 | def sanitize_database(self): | 1462 | def sanitize_database(self): |
1496 | """Searches for issues in the database and correct them""" | 1463 | """Searches for issues in the database and correct them""" |
1497 | 1464 | ||
... | @@ -1540,6 +1507,20 @@ class Pipeline: | ... | @@ -1540,6 +1507,20 @@ class Pipeline: |
1540 | # for x in r: | 1507 | # for x in r: |
1541 | # print(x) | 1508 | # print(x) |
1542 | 1509 | ||
1510 | + # check that, for each family, the filtered alignment has the same length as the number of saved alignment columns | ||
1511 | + r = sql_ask_database(conn, """select family.rfam_acc, count, ali_filtered_len | ||
1512 | + FROM family | ||
1513 | + LEFT JOIN ( | ||
1514 | + SELECT rfam_acc, count(distinct index_ali) as count from align_column where index_ali>0 group by rfam_acc | ||
1515 | + ) AS s ON family.rfam_acc=s.rfam_acc;""") | ||
1516 | + for f in r: | ||
1517 | + if f[1] is None or f[2] is None: | ||
1518 | + warn(f"{f[0]} has incomplete alignement data: {f[1]} alignement columns saved, filtered alignment is of length {f[2]}") | ||
1519 | + continue | ||
1520 | + | ||
1521 | + if f[1] != f[2]: | ||
1522 | + warn(f"{f[0]} has {f[1]} alignement columns saved, but its filtered alignment is of length {f[2]} !") | ||
1523 | + | ||
1543 | conn.close() | 1524 | conn.close() |
1544 | 1525 | ||
1545 | 1526 | ||
... | @@ -1684,6 +1665,8 @@ def sql_define_tables(conn): | ... | @@ -1684,6 +1665,8 @@ def sql_define_tables(conn): |
1684 | freq_G REAL, | 1665 | freq_G REAL, |
1685 | freq_U REAL, | 1666 | freq_U REAL, |
1686 | freq_other REAL, | 1667 | freq_other REAL, |
1668 | + gap_percent REAL, | ||
1669 | + consensus CHAR(1), | ||
1687 | PRIMARY KEY (rfam_acc, index_ali), | 1670 | PRIMARY KEY (rfam_acc, index_ali), |
1688 | FOREIGN KEY(rfam_acc) REFERENCES family(rfam_acc) | 1671 | FOREIGN KEY(rfam_acc) REFERENCES family(rfam_acc) |
1689 | ); | 1672 | ); |
... | @@ -2158,7 +2141,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): | ... | @@ -2158,7 +2141,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): |
2158 | with open(fasta, 'w') as f: | 2141 | with open(fasta, 'w') as f: |
2159 | for rec in seqfile: | 2142 | for rec in seqfile: |
2160 | if rec.id not in doublons: | 2143 | if rec.id not in doublons: |
2161 | - f.write(rec.format("fasta")) | 2144 | + f.write(format(rec, "fasta")) |
2162 | 2145 | ||
2163 | # Add the new sequences with previous ones, if any | 2146 | # Add the new sequences with previous ones, if any |
2164 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f: | 2147 | with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f: |
... | @@ -2333,28 +2316,8 @@ def work_realign(rfam_acc): | ... | @@ -2333,28 +2316,8 @@ def work_realign(rfam_acc): |
2333 | er.write(f"Failed to realign {rfam_acc} (killed)") | 2316 | er.write(f"Failed to realign {rfam_acc} (killed)") |
2334 | 2317 | ||
2335 | 2318 | ||
2336 | -def summarize_position(counts): | ||
2337 | - """ Counts the number of nucleotides at a given position, given a "column" from a MSA. | ||
2338 | - """ | ||
2339 | - | ||
2340 | - # Count modified nucleotides | ||
2341 | - chars = counts.keys() | ||
2342 | - known_chars_count = 0 | ||
2343 | - N = 0 | ||
2344 | - for char in chars: | ||
2345 | - if char in "ACGU": | ||
2346 | - known_chars_count += counts[char] | ||
2347 | - if char not in ".-": | ||
2348 | - N += counts[char] # number of ungapped residues | ||
2349 | - | ||
2350 | - if N: # prevent division by zero if the column is only gaps | ||
2351 | - return (counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...) | ||
2352 | - else: | ||
2353 | - return (0, 0, 0, 0, 0) | ||
2354 | - | ||
2355 | - | ||
2356 | @trace_unhandled_exceptions | 2319 | @trace_unhandled_exceptions |
2357 | -def work_pssm_remap(f, fill_gaps): | 2320 | +def work_pssm_remap(f): |
2358 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. | 2321 | """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. |
2359 | This also remaps the 3D object sequence with the aligned sequence in the MSA. | 2322 | This also remaps the 3D object sequence with the aligned sequence in the MSA. |
2360 | If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing. | 2323 | If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing. |
... | @@ -2385,11 +2348,54 @@ def work_pssm_remap(f, fill_gaps): | ... | @@ -2385,11 +2348,54 @@ def work_pssm_remap(f, fill_gaps): |
2385 | with open(runDir + "/errors.txt", "a") as errf: | 2348 | with open(runDir + "/errors.txt", "a") as errf: |
2386 | errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n") | 2349 | errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n") |
2387 | return 1 | 2350 | return 1 |
2351 | + nseqs = len(align) | ||
2352 | + ncols = align.get_alignment_length() | ||
2388 | 2353 | ||
2389 | # Compute statistics per column | 2354 | # Compute statistics per column |
2390 | - pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx) | 2355 | + pssm_info = np.zeros((6, ncols)) |
2391 | - frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ] | 2356 | + res_index = {'A':0, 'C':1, 'G':2, 'U':3, 'N':4, '-':5} |
2392 | - del pssm | 2357 | + letters = "ACGUN" |
2358 | + consensus = [] | ||
2359 | + | ||
2360 | + for residue_num in tqdm(range(ncols), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Count bases in fam {f}", leave=False): | ||
2361 | + | ||
2362 | + # Count the bases (iterate lines) | ||
2363 | + for record in align: | ||
2364 | + letter = record.seq[residue_num].upper().replace('.','-') | ||
2365 | + try: | ||
2366 | + idx = res_index[letter] | ||
2367 | + except KeyError: | ||
2368 | + # warn(f"Unknown residue found in {family} family: {letter}", error=True) | ||
2369 | + # These are K, R, etc from Rfam. The RNANet sequences provided are pure ACGUN, but not the Rfam ones. | ||
2370 | + idx = 4 # consider it is N | ||
2371 | + pssm_info[idx,residue_num] += 1.0 | ||
2372 | + | ||
2373 | + # Get the number of non-gap nucleotides | ||
2374 | + N = 0 | ||
2375 | + for i in range(5): | ||
2376 | + N += pssm_info[i,residue_num] | ||
2377 | + | ||
2378 | + if N>0: | ||
2379 | + # Divide base counts by number of non-gaps | ||
2380 | + for i in range(5): | ||
2381 | + pssm_info[i,residue_num] /= N | ||
2382 | + | ||
2383 | + # last line is for the gap percentage (Ngaps/Nlines) | ||
2384 | + pssm_info[5,residue_num] /= nseqs | ||
2385 | + | ||
2386 | + # Define consensus base for this position: | ||
2387 | + if pssm_info[5,residue_num] > 0.7: | ||
2388 | + # gaps are in majority if over 75% (that's my definition) | ||
2389 | + consensus.append('-') | ||
2390 | + else: | ||
2391 | + idx = np.argmax(pssm_info[0:5,residue_num]) | ||
2392 | + if pssm_info[idx, residue_num] > 0.5: | ||
2393 | + consensus.append(letters[idx]) | ||
2394 | + else: | ||
2395 | + consensus.append('N') | ||
2396 | + | ||
2397 | + # At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars. | ||
2398 | + | ||
2393 | 2399 | ||
2394 | ########################################################################################## | 2400 | ########################################################################################## |
2395 | # Remap sequences of the 3D chains with sequences in the alignment | 2401 | # Remap sequences of the 3D chains with sequences in the alignment |
... | @@ -2397,60 +2403,51 @@ def work_pssm_remap(f, fill_gaps): | ... | @@ -2397,60 +2403,51 @@ def work_pssm_remap(f, fill_gaps): |
2397 | 2403 | ||
2398 | setproctitle(f"RNAnet.py work_pssm_remap({f}) remap") | 2404 | setproctitle(f"RNAnet.py work_pssm_remap({f}) remap") |
2399 | 2405 | ||
2400 | - # For each sequence, find the right chain and remap chain residues with alignment columns | 2406 | + # For each sequence, remap chain residues with sequence alignment |
2401 | columns_to_save = set() | 2407 | columns_to_save = set() |
2402 | re_mappings = [] | 2408 | re_mappings = [] |
2403 | - alilen = align.get_alignment_length() | 2409 | + pbar = tqdm(total=nseqs, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False) |
2404 | - pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False) | ||
2405 | pbar.update(0) | 2410 | pbar.update(0) |
2406 | for s in align: | 2411 | for s in align: |
2407 | - if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain | 2412 | + # skip Rfamseq entries |
2413 | + if not '[' in s.id: | ||
2408 | continue | 2414 | continue |
2409 | 2415 | ||
2410 | - # Check if the chain existed before in the database | 2416 | + # Get the chain id in the database |
2411 | - if s.id in chains_ids: | 2417 | + conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0) |
2412 | - # a chain object is found in the update, this sequence is new | 2418 | + conn.execute('pragma journal_mode=wal') |
2413 | - this_chain = list_of_chains[chains_ids.index(s.id)] | 2419 | + db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';") |
2414 | - seq_to_align = this_chain.seq_to_align | 2420 | + if len(db_id): |
2415 | - full_length = this_chain.full_length | 2421 | + db_id = db_id[0][0] |
2416 | - db_id = this_chain.db_chain_id | ||
2417 | else: | 2422 | else: |
2418 | - # it existed in the database before. | ||
2419 | - # Get the chain id in the database | ||
2420 | - conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0) | ||
2421 | - conn.execute('pragma journal_mode=wal') | ||
2422 | - db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';") | ||
2423 | - if len(db_id): | ||
2424 | - db_id = db_id[0][0] | ||
2425 | - else: | ||
2426 | - conn.close() | ||
2427 | - warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...") | ||
2428 | - pbar.update(1) | ||
2429 | - continue | ||
2430 | - seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")]) | ||
2431 | - full_length = len(seq_to_align) | ||
2432 | conn.close() | 2423 | conn.close() |
2424 | + warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...") | ||
2425 | + pbar.update(1) | ||
2426 | + continue | ||
2427 | + seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")]) | ||
2428 | + full_length = len(seq_to_align) | ||
2429 | + conn.close() | ||
2433 | 2430 | ||
2434 | # Save colums in the appropriate positions | 2431 | # Save colums in the appropriate positions |
2435 | i = 0 # to iterate the object sequence | 2432 | i = 0 # to iterate the object sequence |
2436 | j = 0 # to iterate the alignment sequence | 2433 | j = 0 # to iterate the alignment sequence |
2437 | - while i < full_length and j < alilen: | 2434 | + while i < full_length and j < ncols: |
2438 | # Here we try to map seq_to_align (the sequence of the 3D chain, including gaps when residues are missing), | 2435 | # Here we try to map seq_to_align (the sequence of the 3D chain, including gaps when residues are missing), |
2439 | # with s.seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and . | 2436 | # with s.seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and . |
2440 | 2437 | ||
2441 | - if seq_to_align[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps) | 2438 | + if seq_to_align[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps) |
2442 | re_mappings.append((db_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1. | 2439 | re_mappings.append((db_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1. |
2443 | columns_to_save.add(j+1) # it's a set, doublons are automaticaly ignored | 2440 | columns_to_save.add(j+1) # it's a set, doublons are automaticaly ignored |
2444 | i += 1 | 2441 | i += 1 |
2445 | j += 1 | 2442 | j += 1 |
2446 | - elif seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence | 2443 | + elif seq_to_align[i] == '-': # '-' in the chain, but '.' or letter in the aligned sequence |
2447 | # search for a gap to the consensus nearby | 2444 | # search for a gap to the consensus nearby |
2448 | k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.' | 2445 | k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.' |
2449 | - while j+k < alilen and s.seq[j+k] == '.': | 2446 | + while j+k < ncols and s.seq[j+k] == '.': |
2450 | k += 1 | 2447 | k += 1 |
2451 | 2448 | ||
2452 | # if found, set j to that position | 2449 | # if found, set j to that position |
2453 | - if j+k < alilen and s.seq[j+k] == '-': | 2450 | + if j+k < ncols and s.seq[j+k] == '-': |
2454 | re_mappings.append((db_id, i+1, j+k+1)) | 2451 | re_mappings.append((db_id, i+1, j+k+1)) |
2455 | columns_to_save.add(j+k+1) | 2452 | columns_to_save.add(j+k+1) |
2456 | i += 1 | 2453 | i += 1 |
... | @@ -2458,31 +2455,28 @@ def work_pssm_remap(f, fill_gaps): | ... | @@ -2458,31 +2455,28 @@ def work_pssm_remap(f, fill_gaps): |
2458 | continue | 2455 | continue |
2459 | 2456 | ||
2460 | # if not, take the insertion gap if this is one | 2457 | # if not, take the insertion gap if this is one |
2461 | - if j < alilen and s.seq[j] == '.': | 2458 | + if j < ncols and s.seq[j] == '.': |
2462 | re_mappings.append((db_id, i+1, j+1)) | 2459 | re_mappings.append((db_id, i+1, j+1)) |
2463 | columns_to_save.add(j+1) | 2460 | columns_to_save.add(j+1) |
2464 | i += 1 | 2461 | i += 1 |
2465 | j += 1 | 2462 | j += 1 |
2466 | continue | 2463 | continue |
2467 | 2464 | ||
2468 | - # else, just mark the gap as unknown (there is an alignment mismatch) | 2465 | + # else, just mark the gap as unknown (there is an alignment mismatch '-' in the 3D facing a letter in the alignment) |
2469 | re_mappings.append((db_id, i+1, 0)) | 2466 | re_mappings.append((db_id, i+1, 0)) |
2470 | i += 1 | 2467 | i += 1 |
2471 | elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain | 2468 | elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain |
2472 | j += 1 # ignore the column | 2469 | j += 1 # ignore the column |
2473 | else: # sequence mismatch which is not a gap... | 2470 | else: # sequence mismatch which is not a gap... |
2474 | - print(f"You are never supposed to reach this. Comparing {self.chain_label} in {i} ({self.seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).", | 2471 | + print(f"You are never supposed to reach this. Comparing {s.id} in {i} ({seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).", |
2475 | - self.seq_to_align, s.seq, sep='\n', flush=True) | 2472 | + seq_to_align, s.seq, sep='\n', flush=True) |
2476 | raise Exception('Something is wrong with sequence alignment.') | 2473 | raise Exception('Something is wrong with sequence alignment.') |
2477 | 2474 | ||
2478 | pbar.update(1) | 2475 | pbar.update(1) |
2479 | pbar.close() | 2476 | pbar.close() |
2480 | 2477 | ||
2481 | - # Check we found something | 2478 | + # Get a sorted list from the set |
2482 | - if not len(re_mappings): | 2479 | + columns = sorted(columns_to_save) |
2483 | - warn(f"Chains were not found in {f}++.afa file: {chains_ids}", error=True) | ||
2484 | - return 1 | ||
2485 | - | ||
2486 | 2480 | ||
2487 | ########################################################################################## | 2481 | ########################################################################################## |
2488 | # Save the alignment columns and their mappings to the database | 2482 | # Save the alignment columns and their mappings to the database |
... | @@ -2505,75 +2499,48 @@ def work_pssm_remap(f, fill_gaps): | ... | @@ -2505,75 +2499,48 @@ def work_pssm_remap(f, fill_gaps): |
2505 | if col not in columns_to_save: | 2499 | if col not in columns_to_save: |
2506 | unused.append((f, col)) | 2500 | unused.append((f, col)) |
2507 | sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused) | 2501 | sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused) |
2502 | + conn.commit() | ||
2503 | + | ||
2508 | # Save the useful columns in the database | 2504 | # Save the useful columns in the database |
2509 | - data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)] | 2505 | + data = [(f, j) + tuple(pssm_info[:,j-1]) + (consensus[j-1],) for j in sorted(columns_to_save)] |
2510 | - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) | 2506 | + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus) |
2511 | - VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO | 2507 | + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO |
2512 | - UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data) | 2508 | + UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, |
2513 | - # Add an unknown values column, with index_ali 0 | 2509 | + freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data) |
2514 | - sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) | 2510 | + # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) |
2515 | - VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 1.0);""", data=(f,)) | 2511 | + sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus) |
2512 | + VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-');""", data=(f,)) | ||
2516 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) | 2513 | # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) |
2517 | sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) | 2514 | sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) |
2518 | conn.close() | 2515 | conn.close() |
2519 | 2516 | ||
2520 | ########################################################################################## | 2517 | ########################################################################################## |
2521 | - # Replacing gaps in the 3D chains by consensus sequences | 2518 | + # Saving the filtered alignment with only the saved positions |
2522 | ########################################################################################## | 2519 | ########################################################################################## |
2523 | 2520 | ||
2524 | - setproctitle(f"RNAnet.py work_pssm_remap({f}) replace gaps") | 2521 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) filtering alignment") |
2525 | 2522 | ||
2526 | - # Replace gaps by consensus | 2523 | + # filter the alignment |
2527 | - if fill_gaps: | 2524 | + names = [ x.id for x in align if '[' in x.id ] |
2528 | - pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Replace {f} gaps", leave=False) | 2525 | + align = align[-len(names):] |
2529 | - pbar.update(0) | 2526 | + filtered_alignment = align[:, 1:1] # all the lines, but no columns |
2530 | - gaps = [] | 2527 | + for p in columns: |
2531 | - conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0) | 2528 | + filtered_alignment += align[:, p-1:p] # save columns one by one |
2532 | - conn.execute('pragma journal_mode=wal') | ||
2533 | - for s in align: | ||
2534 | - if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain | ||
2535 | - continue | ||
2536 | - | ||
2537 | - db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';") | ||
2538 | - if len(db_id): | ||
2539 | - db_id = db_id[0][0] | ||
2540 | - else: | ||
2541 | - pbar.update(1) | ||
2542 | - continue | ||
2543 | - seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ]) | ||
2544 | - aliseq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ]) | ||
2545 | - full_length = len(seq) | ||
2546 | - | ||
2547 | - # detect gaps | ||
2548 | - c_seq = list(seq) # contains "ACGUNacgu-" | ||
2549 | - letters = ['A', 'C', 'G', 'U', 'N'] | ||
2550 | - homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM | ||
2551 | - (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={db_id}) | ||
2552 | - NATURAL JOIN re_mapping | ||
2553 | - NATURAL JOIN align_column; | ||
2554 | - """) | ||
2555 | - if homology_data is None or not len(homology_data): | ||
2556 | - with open(runDir + "/errors.txt", "a") as errf: | ||
2557 | - errf.write(f"No homology data found in the database for {s.id} ! Not replacing gaps.\n") | ||
2558 | - continue | ||
2559 | - elif len(homology_data) != full_length: | ||
2560 | - with open(runDir + "/errors.txt", "a") as errf: | ||
2561 | - errf.write(f"Found {len(homology_data)} nucleotides for {s.id} of length {full_length} ! Not replacing gaps.\n") | ||
2562 | - continue | ||
2563 | - for i in range(full_length): | ||
2564 | - if c_seq[i] == '-': | ||
2565 | - freq = homology_data[i] | ||
2566 | - l = letters[freq.index(max(freq))] | ||
2567 | - gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', db_id, i+1)) | ||
2568 | - pbar.update(1) | ||
2569 | - sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?, | ||
2570 | - is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ? | ||
2571 | - WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps) | ||
2572 | - conn.close() | ||
2573 | - idxQueue.put(thr_idx) # replace the thread index in the queue | ||
2574 | 2529 | ||
2575 | - setproctitle(f"RNAnet.py work_pssm_remap({f}) finished") | 2530 | + # write it to file in both STK and FASTA formats (STK required for distance matrices in statistics) |
2531 | + with open(path_to_seq_data+f"/realigned/{f}_3d_only.stk", "w") as only_3d: | ||
2532 | + try: | ||
2533 | + only_3d.write(format(filtered_alignment, "stockholm")) | ||
2534 | + except ValueError as e: | ||
2535 | + warn(e) | ||
2536 | + with open(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "w") as only_3d: | ||
2537 | + try: | ||
2538 | + only_3d.write(format(filtered_alignment, "fasta")) | ||
2539 | + except ValueError as e: | ||
2540 | + warn(e) | ||
2576 | 2541 | ||
2542 | + setproctitle(f"RNAnet.py work_pssm_remap({f}) finished") | ||
2543 | + idxQueue.put(thr_idx) # replace the thread index in the queue | ||
2577 | return 0 | 2544 | return 0 |
2578 | 2545 | ||
2579 | 2546 | ||
... | @@ -2587,7 +2554,7 @@ def work_save(c, homology=True): | ... | @@ -2587,7 +2554,7 @@ def work_save(c, homology=True): |
2587 | if homology: | 2554 | if homology: |
2588 | df = pd.read_sql_query(f""" | 2555 | df = pd.read_sql_query(f""" |
2589 | SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, | 2556 | SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, |
2590 | - is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, dbn, | 2557 | + is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, dbn, |
2591 | paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, | 2558 | paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, |
2592 | chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, | 2559 | chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, |
2593 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM | 2560 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM |
... | @@ -2631,9 +2598,9 @@ if __name__ == "__main__": | ... | @@ -2631,9 +2598,9 @@ if __name__ == "__main__": |
2631 | sql_define_tables(conn) | 2598 | sql_define_tables(conn) |
2632 | print("> Storing results into", runDir + "/results/RNANet.db") | 2599 | print("> Storing results into", runDir + "/results/RNANet.db") |
2633 | 2600 | ||
2634 | - # # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc). | 2601 | + # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc). |
2635 | - # # If --all was passed, all the structures are kept. | 2602 | + # If --all was passed, all the structures are kept. |
2636 | - # # Fills pp.update with Chain() objects. | 2603 | + # Fills pp.update with Chain() objects. |
2637 | # pp.list_available_mappings() | 2604 | # pp.list_available_mappings() |
2638 | 2605 | ||
2639 | # =========================================================================== | 2606 | # =========================================================================== |
... | @@ -2642,7 +2609,8 @@ if __name__ == "__main__": | ... | @@ -2642,7 +2609,8 @@ if __name__ == "__main__": |
2642 | 2609 | ||
2643 | # # Download and annotate new RNA 3D chains (Chain objects in pp.update) | 2610 | # # Download and annotate new RNA 3D chains (Chain objects in pp.update) |
2644 | # # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed. | 2611 | # # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed. |
2645 | - # pp.dl_and_annotate(coeff_ncores=0.5) | 2612 | + # # pp.dl_and_annotate(coeff_ncores=0.5) |
2613 | + # pp.dl_and_annotate(coeff_ncores=1.0) | ||
2646 | # print("Here we go.") | 2614 | # print("Here we go.") |
2647 | 2615 | ||
2648 | # # At this point, the structure table is up to date. | 2616 | # # At this point, the structure table is up to date. |
... | @@ -2710,7 +2678,7 @@ if __name__ == "__main__": | ... | @@ -2710,7 +2678,7 @@ if __name__ == "__main__": |
2710 | # Prepare the results | 2678 | # Prepare the results |
2711 | # ========================================================================================== | 2679 | # ========================================================================================== |
2712 | 2680 | ||
2713 | - pp.sanitize_database() | 2681 | + # pp.sanitize_database() |
2714 | pp.output_results() | 2682 | pp.output_results() |
2715 | 2683 | ||
2716 | print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove | 2684 | print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove | ... | ... |
... | @@ -4,7 +4,7 @@ | ... | @@ -4,7 +4,7 @@ |
4 | # Run this file if you want the base counts, pair-type counts, identity percents, etc | 4 | # Run this file if you want the base counts, pair-type counts, identity percents, etc |
5 | # in the database. | 5 | # in the database. |
6 | 6 | ||
7 | -import getopt, os, pickle, sqlite3, shlex, subprocess, sys | 7 | +import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings |
8 | import numpy as np | 8 | import numpy as np |
9 | import pandas as pd | 9 | import pandas as pd |
10 | import threading as th | 10 | import threading as th |
... | @@ -16,6 +16,7 @@ import scipy.cluster.hierarchy as sch | ... | @@ -16,6 +16,7 @@ import scipy.cluster.hierarchy as sch |
16 | from scipy.spatial.distance import squareform | 16 | from scipy.spatial.distance import squareform |
17 | from mpl_toolkits.mplot3d import axes3d | 17 | from mpl_toolkits.mplot3d import axes3d |
18 | from Bio import AlignIO, SeqIO | 18 | from Bio import AlignIO, SeqIO |
19 | +from Bio.PDB.MMCIFParser import MMCIFParser | ||
19 | from functools import partial | 20 | from functools import partial |
20 | from multiprocessing import Pool, Manager | 21 | from multiprocessing import Pool, Manager |
21 | from os import path | 22 | from os import path |
... | @@ -429,11 +430,7 @@ def parallel_stats_pairs(f): | ... | @@ -429,11 +430,7 @@ def parallel_stats_pairs(f): |
429 | @trace_unhandled_exceptions | 430 | @trace_unhandled_exceptions |
430 | def to_id_matrix(f): | 431 | def to_id_matrix(f): |
431 | """ | 432 | """ |
432 | - Extracts sequences of 3D chains from the family alignments to a distinct STK file, | 433 | + Runs esl-alipid on the filtered alignment to get an identity matrix. |
433 | - then runs esl-alipid on it to get an identity matrix. | ||
434 | - | ||
435 | - Side-effect : also produces the 3D_only family alignment as a separate file. | ||
436 | - So, we use this function to update 'ali_filtered_length' in the family table. | ||
437 | """ | 434 | """ |
438 | if path.isfile("data/"+f+".npy"): | 435 | if path.isfile("data/"+f+".npy"): |
439 | return 0 | 436 | return 0 |
... | @@ -444,35 +441,18 @@ def to_id_matrix(f): | ... | @@ -444,35 +441,18 @@ def to_id_matrix(f): |
444 | 441 | ||
445 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_id_matrix({f})") | 442 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_id_matrix({f})") |
446 | 443 | ||
447 | - # Prepare a file | 444 | + if not path.isfile(f"{path_to_seq_data}/realigned/{f}_3d_only.stk"): |
448 | - with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file: | 445 | + warn(f"File not found: {path_to_seq_data}/realigned/{f}_3d_only.stk") |
449 | - al = AlignIO.read(al_file, "fasta") | 446 | + align = AlignIO.read(f"{path_to_seq_data}/realigned/{f}_3d_only.stk", "stockholm") |
450 | - names = [ x.id for x in al if '[' in x.id ] | 447 | + names = [ x.id for x in align if '[' in x.id ] |
451 | - al = al[-len(names):] | ||
452 | - with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d: | ||
453 | - try: | ||
454 | - only_3d.write(al.format("stockholm")) | ||
455 | - except ValueError as e: | ||
456 | - warn(e) | ||
457 | - del al | ||
458 | - subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", # | ||
459 | - "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", # This run just deletes columns of gaps | ||
460 | - "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) # | ||
461 | - subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk", f + "_3d_only.stk"]) | ||
462 | - subprocess.run(["esl-reformat", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.afa", "afa", path_to_seq_data+f"/realigned/{f}_3d_only.stk"]) | ||
463 | - | ||
464 | - # Out-of-scope task : update the database with the length of the filtered alignment: | ||
465 | - align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta") | ||
466 | - with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | ||
467 | - conn.execute('pragma journal_mode=wal') | ||
468 | - sql_execute(conn, "UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=[align.get_alignment_length(), f]) | ||
469 | del align | 448 | del align |
470 | - | 449 | + |
450 | + pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False) | ||
451 | + pbar.update(0) | ||
452 | + | ||
471 | # Prepare the job | 453 | # Prepare the job |
472 | - process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"), | 454 | + process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}/realigned/{f}_3d_only.stk"), stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
473 | - stdout=subprocess.PIPE, stderr=subprocess.PIPE) | ||
474 | id_matrix = np.zeros((len(names), len(names))) | 455 | id_matrix = np.zeros((len(names), len(names))) |
475 | - pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False) | ||
476 | cnt = 0 | 456 | cnt = 0 |
477 | while not cnt or process.poll() is None: | 457 | while not cnt or process.poll() is None: |
478 | output = process.stdout.read() | 458 | output = process.stdout.read() |
... | @@ -632,7 +612,6 @@ def stats_pairs(): | ... | @@ -632,7 +612,6 @@ def stats_pairs(): |
632 | plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) | 612 | plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) |
633 | plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png") | 613 | plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png") |
634 | 614 | ||
635 | - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | ||
636 | notify("Computed nucleotide statistics and saved CSV and PNG file.") | 615 | notify("Computed nucleotide statistics and saved CSV and PNG file.") |
637 | 616 | ||
638 | @trace_unhandled_exceptions | 617 | @trace_unhandled_exceptions |
... | @@ -931,7 +910,141 @@ def general_stats(): | ... | @@ -931,7 +910,141 @@ def general_stats(): |
931 | hspace=0.05, bottom=0.12, top=0.84) | 910 | hspace=0.05, bottom=0.12, top=0.84) |
932 | fig.savefig(runDir + "/results/figures/Nfamilies.png") | 911 | fig.savefig(runDir + "/results/figures/Nfamilies.png") |
933 | plt.close() | 912 | plt.close() |
913 | + | ||
914 | +def get_matrix_euclidian_distance(cif_file, aligned_seq, consider_all_atoms): | ||
915 | + """ | ||
916 | + This function | ||
917 | + - loads the coordinates and the alignment, reconstructs the alignment but with coordinates, considering gaps, and | ||
918 | + - computes the matrix of Euclidean distances. | ||
919 | + | ||
920 | + Returns: | ||
921 | + The 2D np.array of Euclidean distances between pairs of nucleotides, with np.nan in the rows and columns of gap positions. | ||
922 | + """ | ||
923 | + # Load the barycenter (or C1' atom) coordinates of each nucleotide | ||
924 | + coordinates = nt_3d_centers(cif_file, consider_all_atoms) | ||
925 | + | ||
926 | + # reconstruct the alignment but with coordinates | ||
927 | + nb_gap = 0 | ||
928 | + coordinates_with_gaps = [] | ||
929 | + for i in range(len(aligned_seq)): | ||
930 | + if aligned_seq[i] == '.' or aligned_seq[i] == '-': | ||
931 | + nb_gap = nb_gap + 1 | ||
932 | + coordinates_with_gaps.append('NA') | ||
933 | + else: | ||
934 | + coordinates_with_gaps.append(coordinates[i - nb_gap]) | ||
935 | + | ||
936 | + nb_nucleotides = len(coordinates_with_gaps) # number of nucleotides | ||
937 | + matrix = np.zeros((nb_nucleotides, nb_nucleotides)) # create a new empty matrix of size nxn | ||
938 | + | ||
939 | + # Fill this new matrix with the Euclidean distances between all nucleotides, considering gaps: | ||
940 | + for i in range(nb_nucleotides): | ||
941 | + for j in range(nb_nucleotides): | ||
942 | + if coordinates_with_gaps[i] == 'NA' or coordinates_with_gaps[j] == 'NA': | ||
943 | + matrix[i][j] = np.nan | ||
944 | + else: | ||
945 | + matrix[i][j] = round(get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]),3) | ||
946 | + return(matrix) | ||
947 | + | ||
948 | +@trace_unhandled_exceptions | ||
949 | +def get_avg_std_distance_matrix(f, consider_all_atoms): | ||
950 | + # Get a worker number to position the progress bar | ||
951 | + global idxQueue | ||
952 | + thr_idx = idxQueue.get() | ||
953 | + | ||
954 | + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} residue distance matrices") | ||
955 | + | ||
956 | + if consider_all_atoms: | ||
957 | + label = "base" | ||
958 | + else: | ||
959 | + label = "backbone" | ||
960 | + | ||
961 | + os.makedirs(runDir + '/results/distance_matrices/' + f + '_' + label, exist_ok=True ) | ||
962 | + | ||
963 | + | ||
964 | + family_matrices = [] | ||
965 | + align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta") | ||
966 | + found = 0 | ||
967 | + notfound = 0 | ||
968 | + pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False) | ||
969 | + pbar.update(0) | ||
970 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | ||
971 | + conn.execute('pragma journal_mode=wal') | ||
972 | + r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';") | ||
973 | + filelist = [ ''.join(list(x))+'.cif' for x in r ] | ||
974 | + | ||
975 | + for s in align: | ||
976 | + filename = '' | ||
977 | + for file in filelist: | ||
978 | + if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')): | ||
979 | + filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file | ||
980 | + break | ||
981 | + if len(filename): | ||
982 | + found += 1 | ||
983 | + try: | ||
984 | + euclidian_distance = get_matrix_euclidian_distance(filename, s.seq, consider_all_atoms) | ||
985 | + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', euclidian_distance, delimiter=",", fmt="%.3f") | ||
986 | + family_matrices.append(euclidian_distance) | ||
987 | + except FileNotFoundError: | ||
988 | + found -= 1 | ||
989 | + notfound += 1 | ||
990 | + else: | ||
991 | + notfound += 1 | ||
992 | + pbar.update(1) | ||
993 | + | ||
994 | + # Calculation of the average matrix | ||
995 | + avgarray = np.array(family_matrices) | ||
996 | + if len(avgarray) == 0 or np.prod(avgarray.shape) == 0: | ||
997 | + warn(f"Something's wrong with the shapes: {avgarray.shape}", error=True) | ||
998 | + with warnings.catch_warnings(): | ||
999 | + warnings.simplefilter("ignore", category=RuntimeWarning) | ||
1000 | + matrix_average_distances = np.nanmean(avgarray, axis=0 ) | ||
1001 | + | ||
1002 | + if len(matrix_average_distances) != 0: | ||
1003 | + matrix_average_distances = np.nan_to_num(matrix_average_distances) | ||
1004 | + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , np.triu(matrix_average_distances), delimiter=",", fmt="%.3f") | ||
1005 | + | ||
1006 | + fig, ax = plt.subplots() | ||
1007 | + im = ax.imshow(matrix_average_distances) | ||
1008 | + cbar = ax.figure.colorbar(im, ax=ax) | ||
1009 | + cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom") | ||
1010 | + ax.set_title("Average distance between residues (Angströms)") | ||
1011 | + fig.tight_layout() | ||
1012 | + fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.png', dpi=300) | ||
1013 | + plt.close() | ||
1014 | + | ||
1015 | + # Calculation of the standard deviation matrix | ||
1016 | + with warnings.catch_warnings(): | ||
1017 | + warnings.simplefilter("ignore", category=RuntimeWarning) | ||
1018 | + matrix_standard_deviation_distances = np.nanstd(avgarray, axis=0 ) | ||
1019 | + | ||
1020 | + if len(matrix_standard_deviation_distances) != 0: | ||
1021 | + matrix_standard_deviation_distances = np.nan_to_num(matrix_standard_deviation_distances) | ||
1022 | + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , np.triu(matrix_standard_deviation_distances), delimiter=",", fmt="%.3f") | ||
1023 | + | ||
1024 | + fig, ax = plt.subplots() | ||
1025 | + im = ax.imshow(matrix_standard_deviation_distances) | ||
1026 | + cbar = ax.figure.colorbar(im, ax=ax) | ||
1027 | + cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom") | ||
1028 | + ax.set_title("Average distance between residues (Angströms)") | ||
1029 | + fig.tight_layout() | ||
1030 | + fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_std.png', dpi=300) | ||
1031 | + plt.close() | ||
1032 | + | ||
1033 | + # Save log | ||
1034 | + with open(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '.log', 'a') as logfile: | ||
1035 | + logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found.\n") | ||
1036 | + | ||
1037 | + # Save associated nucleotide frequencies (off-topic but convenient to do it here) | ||
1038 | + with sqlite3.connect(runDir + "/results/RNANet.db") as conn: | ||
1039 | + conn.execute('pragma journal_mode=wal') | ||
1040 | + df = pd.read_sql_query(f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;", conn) | ||
1041 | + df.to_csv(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_frequencies.csv', float_format="%.3f") | ||
1042 | + | ||
1043 | + pbar.close() | ||
1044 | + | ||
1045 | + idxQueue.put(thr_idx) # replace the thread index in the queue | ||
934 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") | 1046 | setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") |
1047 | + return 0 | ||
935 | 1048 | ||
936 | def log_to_pbar(pbar): | 1049 | def log_to_pbar(pbar): |
937 | def update(r): | 1050 | def update(r): |
... | @@ -952,6 +1065,48 @@ def family_order(f): | ... | @@ -952,6 +1065,48 @@ def family_order(f): |
952 | else: | 1065 | else: |
953 | return 2 | 1066 | return 2 |
954 | 1067 | ||
1068 | +def nt_3d_centers(cif_file, consider_all_atoms): | ||
1069 | + """Return the nucleotides' coordinates, summarizing a nucleotide by only one point. | ||
1070 | + If consider_all_atoms is true, the barycentre of all the atoms of the nucleotide is used; | ||
1071 | + otherwise, the position of its C1' atom is used. | ||
1072 | + """ | ||
1073 | + result =[] | ||
1074 | + structure = MMCIFParser().get_structure(cif_file, cif_file) | ||
1075 | + | ||
1076 | + if consider_all_atoms == True: | ||
1077 | + for model in structure: | ||
1078 | + for chain in model: | ||
1079 | + for residue in chain: | ||
1080 | + temp_list = [] | ||
1081 | + res_isobaricentre = 0 | ||
1082 | + for atom in residue: | ||
1083 | + temp_list.append(atom.get_coord()) | ||
1084 | + lg = len(temp_list) | ||
1085 | + | ||
1086 | + summ = np.sum(temp_list, axis = 0) | ||
1087 | + res_isobaricentre = [summ[0]/lg, summ[1]/lg, summ[2]/lg] | ||
1088 | + result.append([res_isobaricentre[0], res_isobaricentre[1], res_isobaricentre[2]]) | ||
1089 | + | ||
1090 | + elif consider_all_atoms == False: | ||
1091 | + for model in structure: | ||
1092 | + for chain in model: | ||
1093 | + for residue in chain: | ||
1094 | + for atom in residue: | ||
1095 | + if atom.get_name() == "C1'": | ||
1096 | + coordinates = atom.get_coord() | ||
1097 | + res = [coordinates[0], coordinates[1], coordinates[2]] | ||
1098 | + result.append(res) | ||
1099 | + return(result) | ||
1100 | + | ||
1101 | +def get_euclidian_distance(L1, L2): | ||
1102 | + """Returns the distance between two points (coordinates in lists) | ||
1103 | + """ | ||
1104 | + | ||
1105 | + e = 0 | ||
1106 | + for i in range(len(L1)): | ||
1107 | + e += float(L1[i] - L2[i])**2 | ||
1108 | + return np.sqrt(e) | ||
1109 | + | ||
955 | if __name__ == "__main__": | 1110 | if __name__ == "__main__": |
956 | 1111 | ||
957 | os.makedirs(runDir + "/results/figures/", exist_ok=True) | 1112 | os.makedirs(runDir + "/results/figures/", exist_ok=True) |
... | @@ -959,8 +1114,9 @@ if __name__ == "__main__": | ... | @@ -959,8 +1114,9 @@ if __name__ == "__main__": |
959 | # parse options | 1114 | # parse options |
960 | DELETE_OLD_DATA = False | 1115 | DELETE_OLD_DATA = False |
961 | DO_WADLEY_ANALYSIS = False | 1116 | DO_WADLEY_ANALYSIS = False |
1117 | + DO_AVG_DISTANCE_MATRIX = False | ||
962 | try: | 1118 | try: |
963 | - opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ]) | 1119 | + opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "distance-matrices", "resolution=", "3d-folder=", "seq-folder=" ]) |
964 | except getopt.GetoptError as err: | 1120 | except getopt.GetoptError as err: |
965 | print(err) | 1121 | print(err) |
966 | sys.exit(2) | 1122 | sys.exit(2) |
... | @@ -979,9 +1135,12 @@ if __name__ == "__main__": | ... | @@ -979,9 +1135,12 @@ if __name__ == "__main__": |
979 | print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:" | 1135 | print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:" |
980 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") | 1136 | "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") |
981 | print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything") | 1137 | print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything") |
1138 | + print("--distance-matrices\t\tCompute average distance between nucleotide pairs for each family.") | ||
1139 | + print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.") | ||
1140 | + | ||
982 | sys.exit() | 1141 | sys.exit() |
983 | elif opt == '--version': | 1142 | elif opt == '--version': |
984 | - print("RNANet statistics 1.2") | 1143 | + print("RNANet statistics 1.3 beta") |
985 | sys.exit() | 1144 | sys.exit() |
986 | elif opt == "-r" or opt == "--resolution": | 1145 | elif opt == "-r" or opt == "--resolution": |
987 | assert float(arg) > 0.0 and float(arg) <= 20.0 | 1146 | assert float(arg) > 0.0 and float(arg) <= 20.0 |
... | @@ -997,6 +1156,8 @@ if __name__ == "__main__": | ... | @@ -997,6 +1156,8 @@ if __name__ == "__main__": |
997 | elif opt=='--from-scratch': | 1156 | elif opt=='--from-scratch': |
998 | DELETE_OLD_DATA = True | 1157 | DELETE_OLD_DATA = True |
999 | DO_WADLEY_ANALYSIS = True | 1158 | DO_WADLEY_ANALYSIS = True |
1159 | + elif opt=="--distance-matrices": | ||
1160 | + DO_AVG_DISTANCE_MATRIX = True | ||
1000 | elif opt=='--wadley': | 1161 | elif opt=='--wadley': |
1001 | DO_WADLEY_ANALYSIS = True | 1162 | DO_WADLEY_ANALYSIS = True |
1002 | 1163 | ||
... | @@ -1030,6 +1191,8 @@ if __name__ == "__main__": | ... | @@ -1030,6 +1191,8 @@ if __name__ == "__main__": |
1030 | subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"]) | 1191 | subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"]) |
1031 | if DO_WADLEY_ANALYSIS: | 1192 | if DO_WADLEY_ANALYSIS: |
1032 | subprocess.run(["rm","-f", runDir + f"/data/wadley_kernel_eta_{res_thr}.npz", runDir + f"/data/wadley_kernel_eta_prime_{res_thr}.npz", runDir + f"/data/pair_counts_{res_thr}.csv"]) | 1193 | subprocess.run(["rm","-f", runDir + f"/data/wadley_kernel_eta_{res_thr}.npz", runDir + f"/data/wadley_kernel_eta_prime_{res_thr}.npz", runDir + f"/data/pair_counts_{res_thr}.csv"]) |
1194 | + if DO_AVG_DISTANCE_MATRIX: | ||
1195 | + subprocess.run(["rm", "-rf", runDir + f"/results/distance_matrices/"]) | ||
1033 | 1196 | ||
1034 | # Prepare the multiprocessing execution environment | 1197 | # Prepare the multiprocessing execution environment |
1035 | nworkers = min(read_cpu_number()-1, 32) | 1198 | nworkers = min(read_cpu_number()-1, 32) |
... | @@ -1043,6 +1206,17 @@ if __name__ == "__main__": | ... | @@ -1043,6 +1206,17 @@ if __name__ == "__main__": |
1043 | if n_unmapped_chains and DO_WADLEY_ANALYSIS: | 1206 | if n_unmapped_chains and DO_WADLEY_ANALYSIS: |
1044 | joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) | 1207 | joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) |
1045 | joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) | 1208 | joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) |
1209 | + if DO_AVG_DISTANCE_MATRIX: | ||
1210 | + extracted_chains = [] | ||
1211 | + for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"): | ||
1212 | + if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)): | ||
1213 | + e1 = file.split('_')[0] | ||
1214 | + e2 = file.split('_')[1] | ||
1215 | + e3 = file.split('_')[2] | ||
1216 | + extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3) | ||
1217 | + for f in famlist: | ||
1218 | + joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True))) | ||
1219 | + joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False))) | ||
1046 | joblist.append(Job(function=stats_len)) # Computes figures | 1220 | joblist.append(Job(function=stats_len)) # Computes figures |
1047 | joblist.append(Job(function=stats_freq)) # updates the database | 1221 | joblist.append(Job(function=stats_freq)) # updates the database |
1048 | for f in famlist: | 1222 | for f in famlist: | ... | ... |
-
Please register or login to post a comment