Louis BECQUEY

v1.3 large update

1 -v 1.1 beta, January 2021 1 +v 1.3 beta, January 2021
2 2
3 The first uses of RNAnet by people from outside the development team happened this December. 3 The first uses of RNAnet by people from outside the development team happened this December.
4 Their feedback allowed us to identify issues and useful information to add. 4 Their feedback allowed us to identify issues and useful information to add.
5 5
6 FEATURE CHANGES 6 FEATURE CHANGES
7 - - Sequence alignments of the 3D structures mapped to a family are now provided. 7 + - Sequence alignments of the 3D structures mapped to a family are now provided.
8 - Full alignments with Rfam sequences are not provided, but you can ask us for the files. 8 - Full alignments with Rfam sequences are not provided, but you can ask us for the files.
9 - Two new fields in table 'family': ali_length and ali_filtered_length. 9 - Two new fields in table 'family': ali_length and ali_filtered_length.
10 They are the MSA lengths of the alignment with and without the Rfam sequences. 10 They are the MSA lengths of the alignment with and without the Rfam sequences.
11 + - Gap replacement by consensus (--fill-gaps) has been removed. Now, the gap percentage and consensus are saved
12 + in the align_column table and the datapoints in CSV format, in separate columns.
13 + Consensus is one of ACGUN-, the gap being chosen if >75% of the sequences are gaps at this position.
14 + Otherwise, A/C/G/U is chosen if >50% of the non-gap positions are A/C/G/U. Otherwise, N is the consensus.
11 15
12 TECHNICAL CHANGES 16 TECHNICAL CHANGES
13 - - SQLite connexions are now all in WAL mode by default (previously, only the writers used WAL mode) 17 + - SQLite connexions are now all in WAL mode by default (previously, only the writers used WAL mode, but this is useless)
18 + - Moved to Python3.9 for internal testing.
19 + - Latest version of BioPython is now supported (1.78)
14 20
15 BUG CORRECTIONS 21 BUG CORRECTIONS
16 - When an alignment file is updated in a newer run of RNANet, all the re_mappings are now re-computed 22 - When an alignment file is updated in a newer run of RNANet, all the re_mappings are now re-computed
...@@ -19,8 +25,8 @@ BUG CORRECTIONS ...@@ -19,8 +25,8 @@ BUG CORRECTIONS
19 - Changed the ownership and permissions of files produced by the Docker container. 25 - Changed the ownership and permissions of files produced by the Docker container.
20 They were previously owned by root and the user could not get access to them. 26 They were previously owned by root and the user could not get access to them.
21 - Modified nucleotides were not always correctly transformed to N in the alignments (and nucleotide.nt_align_code fields). 27 - Modified nucleotides were not always correctly transformed to N in the alignments (and nucleotide.nt_align_code fields).
22 - Now, the alignments and nt_align_code only contain "ACGUN-" chars. 28 + Now, the alignments and nt_align_code (and consensus) only contain "ACGUN-" chars.
23 - Now, 'N' means 'other', while '-' means 'nothing'. 29 + Now, 'N' means 'other', while '-' means 'nothing' or 'unknown'.
24 30
25 COMING SOON 31 COMING SOON
26 - Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ . 32 - Automated annotation of detected Recurrent Interaction Networks (RINs), see http://carnaval.lri.fr/ .
......
...@@ -14,7 +14,7 @@ RUN apk update && apk add --no-cache \ ...@@ -14,7 +14,7 @@ RUN apk update && apk add --no-cache \
14 py3-matplotlib py3-requests py3-scipy py3-setproctitle py3-sqlalchemy py3-tqdm \ 14 py3-matplotlib py3-requests py3-scipy py3-setproctitle py3-sqlalchemy py3-tqdm \
15 sqlite \ 15 sqlite \
16 \ 16 \
17 - && python3 -m pip install biopython==1.76 pandas psutil pymysql && \ 17 + && python3 -m pip install biopython pandas psutil pymysql && \
18 \ 18 \
19 wget -q -O /etc/apk/keys/sgerrand.rsa.pub https://alpine-pkgs.sgerrand.com/sgerrand.rsa.pub && \ 19 wget -q -O /etc/apk/keys/sgerrand.rsa.pub https://alpine-pkgs.sgerrand.com/sgerrand.rsa.pub && \
20 wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.32-r0/glibc-2.32-r0.apk && \ 20 wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.32-r0/glibc-2.32-r0.apk && \
......
...@@ -125,36 +125,6 @@ class SelectivePortionSelector(object): ...@@ -125,36 +125,6 @@ class SelectivePortionSelector(object):
125 return 1 125 return 1
126 126
127 127
128 -class BufferingSummaryInfo(AlignInfo.SummaryInfo):
129 -
130 - def get_pssm(self, family, index):
131 - """Create a position specific score matrix object for the alignment.
132 -
133 - This creates a position specific score matrix (pssm) which is an
134 - alternative method to look at a consensus sequence.
135 -
136 - Returns:
137 - - A PSSM (position specific score matrix) object.
138 - """
139 -
140 - pssm_info = []
141 - # now start looping through all of the sequences and getting info
142 - for residue_num in tqdm(range(self.alignment.get_alignment_length()), position=index+1, desc=f"Worker {index+1}: Count bases in fam {family}", leave=False):
143 - score_dict = self._get_base_letters("ACGUN")
144 - for record in self.alignment:
145 - this_residue = record.seq[residue_num].upper()
146 - if this_residue not in "-.":
147 - try:
148 - score_dict[this_residue] += 1.0
149 - except KeyError:
150 - # if this_residue in "acgun":
151 - # warn(f"Found {this_residue} in {family} alignment...")
152 - score_dict[this_residue] = 1.0
153 - pssm_info.append(('*', score_dict))
154 -
155 - return AlignInfo.PSSM(pssm_info)
156 -
157 -
158 class Chain: 128 class Chain:
159 """ 129 """
160 The object which stores all our data and the methods to process it. 130 The object which stores all our data and the methods to process it.
...@@ -963,7 +933,6 @@ class Pipeline: ...@@ -963,7 +933,6 @@ class Pipeline:
963 # Default options: 933 # Default options:
964 self.CRYSTAL_RES = 4.0 934 self.CRYSTAL_RES = 4.0
965 self.KEEP_HETATM = False 935 self.KEEP_HETATM = False
966 - self.FILL_GAPS = True
967 self.HOMOLOGY = True 936 self.HOMOLOGY = True
968 self.USE_KNOWN_ISSUES = True 937 self.USE_KNOWN_ISSUES = True
969 self.RUN_STATS = False 938 self.RUN_STATS = False
...@@ -984,11 +953,9 @@ class Pipeline: ...@@ -984,11 +953,9 @@ class Pipeline:
984 setproctitle("RNANet.py process_options()") 953 setproctitle("RNANet.py process_options()")
985 954
986 try: 955 try:
987 - opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", 956 + opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=",
988 - ["help", "resolution=", "keep-hetatm=", "from-scratch", "full-inference," 957 + "from-scratch", "full-inference", "no-homology", "ignore-issues", "extract",
989 - "fill-gaps=", "3d-folder=", "seq-folder=", 958 + "all", "no-logs", "archive", "update-homologous"])
990 - "no-homology", "ignore-issues", "extract", "only=", "all", "no-logs",
991 - "archive", "update-homologous"])
992 except getopt.GetoptError as err: 959 except getopt.GetoptError as err:
993 print(err) 960 print(err)
994 sys.exit(2) 961 sys.exit(2)
...@@ -1014,8 +981,6 @@ class Pipeline: ...@@ -1014,8 +981,6 @@ class Pipeline:
1014 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") 981 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
1015 print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " 982 print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
1016 "\n\t\t\t\tDoes not affect the descriptors.") 983 "\n\t\t\t\tDoes not affect the descriptors.")
1017 - print("--fill-gaps=True\t\t(True | False) Replace gaps in nt_align_code field due to unresolved residues"
1018 - "\n\t\t\t\tby the most common nucleotide at this position in the alignment.")
1019 print("--3d-folder=…\t\t\tPath to a folder to store the 3D data files. Subfolders will contain:" 984 print("--3d-folder=…\t\t\tPath to a folder to store the 3D data files. Subfolders will contain:"
1020 "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format" 985 "\n\t\t\t\t\tRNAcifs/\t\tFull structures containing RNA, in mmCIF format"
1021 "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains" 986 "\n\t\t\t\t\trna_mapped_to_Rfam/\tExtracted 'pure' RNA chains"
...@@ -1038,7 +1003,7 @@ class Pipeline: ...@@ -1038,7 +1003,7 @@ class Pipeline:
1038 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &") 1003 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &")
1039 sys.exit() 1004 sys.exit()
1040 elif opt == '--version': 1005 elif opt == '--version':
1041 - print("RNANet 1.2, parallelized, Dockerized") 1006 + print("RNANet 1.3 beta, parallelized, Dockerized")
1042 sys.exit() 1007 sys.exit()
1043 elif opt == "-r" or opt == "--resolution": 1008 elif opt == "-r" or opt == "--resolution":
1044 assert float(arg) > 0.0 and float(arg) <= 20.0 1009 assert float(arg) > 0.0 and float(arg) <= 20.0
...@@ -1048,9 +1013,6 @@ class Pipeline: ...@@ -1048,9 +1013,6 @@ class Pipeline:
1048 elif opt == "--keep-hetatm": 1013 elif opt == "--keep-hetatm":
1049 assert arg in ["True", "False"] 1014 assert arg in ["True", "False"]
1050 self.KEEP_HETATM = (arg == "True") 1015 self.KEEP_HETATM = (arg == "True")
1051 - elif opt == "--fill-gaps":
1052 - assert arg in ["True", "False"]
1053 - self.FILL_GAPS = (arg == "True")
1054 elif opt == "--no-homology": 1016 elif opt == "--no-homology":
1055 self.HOMOLOGY = False 1017 self.HOMOLOGY = False
1056 elif opt == '--3d-folder': 1018 elif opt == '--3d-folder':
...@@ -1410,13 +1372,12 @@ class Pipeline: ...@@ -1410,13 +1372,12 @@ class Pipeline:
1410 1372
1411 # Start a process pool to dispatch the RNA families, 1373 # Start a process pool to dispatch the RNA families,
1412 # over multiple CPUs (one family by CPU) 1374 # over multiple CPUs (one family by CPU)
1413 - # p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=1)
1414 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers) 1375 p = Pool(initializer=init_worker, initargs=(tqdm.get_lock(),), processes=nworkers)
1415 1376
1416 try: 1377 try:
1417 fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True) 1378 fam_pbar = tqdm(total=len(self.fam_list), desc="RNA families", position=0, leave=True)
1418 # Apply work_pssm_remap to each RNA family 1379 # Apply work_pssm_remap to each RNA family
1419 - for i, _ in enumerate(p.imap_unordered(partial(work_pssm_remap, fill_gaps=self.FILL_GAPS), self.fam_list, chunksize=1)): 1380 + for i, _ in enumerate(p.imap_unordered(work_pssm_remap, self.fam_list, chunksize=1)):
1420 # Everytime the iteration finishes on a family, update the global progress bar over the RNA families 1381 # Everytime the iteration finishes on a family, update the global progress bar over the RNA families
1421 fam_pbar.update(1) 1382 fam_pbar.update(1)
1422 fam_pbar.close() 1383 fam_pbar.close()
...@@ -1492,6 +1453,12 @@ class Pipeline: ...@@ -1492,6 +1453,12 @@ class Pipeline:
1492 subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."]) 1453 subprocess.run(["tar", "-C", path_to_3D_data + "/datapoints", "-czf", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", "."])
1493 subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"]) 1454 subprocess.run(["ln", "-s", runDir + f"/archive/RNANET_datapoints_{datestr}.tar.gz", runDir + f"/archive/RNANET_datapoints_latest.tar.gz"])
1494 1455
1456 + # gather the alignments
1457 + os.makedirs(path_to_seq_data + "realigned/3D_only", exist_ok=True)
1458 + subprocess.run(["cp", path_to_seq_data + "realigned/*_3d_only.afa", path_to_seq_data + "realigned/3d_only" ])
1459 + subprocess.run(["rm", "-f", runDir + f"/archive/RNANET_alignments_latest.tar.gz"])
1460 + subprocess.run(["tar", "-C", path_to_seq_data + "realigned/3d_only" , "-czf", runDir + f"/archive/RNANET_alignments_latest.tar.gz", "."])
1461 +
1495 def sanitize_database(self): 1462 def sanitize_database(self):
1496 """Searches for issues in the database and correct them""" 1463 """Searches for issues in the database and correct them"""
1497 1464
...@@ -1540,6 +1507,20 @@ class Pipeline: ...@@ -1540,6 +1507,20 @@ class Pipeline:
1540 # for x in r: 1507 # for x in r:
1541 # print(x) 1508 # print(x)
1542 1509
1510 + # check that each family's filtered alignment has the same length as its number of saved alignment columns
1511 + r = sql_ask_database(conn, """select family.rfam_acc, count, ali_filtered_len
1512 + FROM family
1513 + LEFT JOIN (
1514 + SELECT rfam_acc, count(distinct index_ali) as count from align_column where index_ali>0 group by rfam_acc
1515 + ) AS s ON family.rfam_acc=s.rfam_acc;""")
1516 + for f in r:
1517 + if f[1] is None or f[2] is None:
1518 + warn(f"{f[0]} has incomplete alignement data: {f[1]} alignement columns saved, filtered alignment is of length {f[2]}")
1519 + continue
1520 +
1521 + if f[1] != f[2]:
1522 + warn(f"{f[0]} has {f[1]} alignement columns saved, but its filtered alignment is of length {f[2]} !")
1523 +
1543 conn.close() 1524 conn.close()
1544 1525
1545 1526
...@@ -1684,6 +1665,8 @@ def sql_define_tables(conn): ...@@ -1684,6 +1665,8 @@ def sql_define_tables(conn):
1684 freq_G REAL, 1665 freq_G REAL,
1685 freq_U REAL, 1666 freq_U REAL,
1686 freq_other REAL, 1667 freq_other REAL,
1668 + gap_percent REAL,
1669 + consensus CHAR(1),
1687 PRIMARY KEY (rfam_acc, index_ali), 1670 PRIMARY KEY (rfam_acc, index_ali),
1688 FOREIGN KEY(rfam_acc) REFERENCES family(rfam_acc) 1671 FOREIGN KEY(rfam_acc) REFERENCES family(rfam_acc)
1689 ); 1672 );
...@@ -2158,7 +2141,7 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2158,7 +2141,7 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2158 with open(fasta, 'w') as f: 2141 with open(fasta, 'w') as f:
2159 for rec in seqfile: 2142 for rec in seqfile:
2160 if rec.id not in doublons: 2143 if rec.id not in doublons:
2161 - f.write(rec.format("fasta")) 2144 + f.write(format(rec, "fasta"))
2162 2145
2163 # Add the new sequences with previous ones, if any 2146 # Add the new sequences with previous ones, if any
2164 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f: 2147 with open(path_to_seq_data + f"realigned/{rfam_acc}++.fa", "a") as f:
...@@ -2333,28 +2316,8 @@ def work_realign(rfam_acc): ...@@ -2333,28 +2316,8 @@ def work_realign(rfam_acc):
2333 er.write(f"Failed to realign {rfam_acc} (killed)") 2316 er.write(f"Failed to realign {rfam_acc} (killed)")
2334 2317
2335 2318
2336 -def summarize_position(counts):
2337 - """ Counts the number of nucleotides at a given position, given a "column" from a MSA.
2338 - """
2339 -
2340 - # Count modified nucleotides
2341 - chars = counts.keys()
2342 - known_chars_count = 0
2343 - N = 0
2344 - for char in chars:
2345 - if char in "ACGU":
2346 - known_chars_count += counts[char]
2347 - if char not in ".-":
2348 - N += counts[char] # number of ungapped residues
2349 -
2350 - if N: # prevent division by zero if the column is only gaps
2351 - return (counts['A']/N, counts['C']/N, counts['G']/N, counts['U']/N, (N - known_chars_count)/N) # other residues, or consensus (N, K, Y...)
2352 - else:
2353 - return (0, 0, 0, 0, 0)
2354 -
2355 -
2356 @trace_unhandled_exceptions 2319 @trace_unhandled_exceptions
2357 -def work_pssm_remap(f, fill_gaps): 2320 +def work_pssm_remap(f):
2358 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. 2321 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
2359 This also remaps the 3D object sequence with the aligned sequence in the MSA. 2322 This also remaps the 3D object sequence with the aligned sequence in the MSA.
2360 If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing. 2323 If asked, the 3D object sequence is completed by the consensus nucleotide when one of them is missing.
...@@ -2385,11 +2348,54 @@ def work_pssm_remap(f, fill_gaps): ...@@ -2385,11 +2348,54 @@ def work_pssm_remap(f, fill_gaps):
2385 with open(runDir + "/errors.txt", "a") as errf: 2348 with open(runDir + "/errors.txt", "a") as errf:
2386 errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n") 2349 errf.write(f"{f}'s alignment is wrong. Recompute it and retry.\n")
2387 return 1 2350 return 1
2351 + nseqs = len(align)
2352 + ncols = align.get_alignment_length()
2388 2353
2389 # Compute statistics per column 2354 # Compute statistics per column
2390 - pssm = BufferingSummaryInfo(align).get_pssm(f, thr_idx) 2355 + pssm_info = np.zeros((6, ncols))
2391 - frequencies = [ summarize_position(pssm[i]) for i in range(align.get_alignment_length()) ] 2356 + res_index = {'A':0, 'C':1, 'G':2, 'U':3, 'N':4, '-':5}
2392 - del pssm 2357 + letters = "ACGUN"
2358 + consensus = []
2359 +
2360 + for residue_num in tqdm(range(ncols), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Count bases in fam {f}", leave=False):
2361 +
2362 + # Count the bases (iterate lines)
2363 + for record in align:
2364 + letter = record.seq[residue_num].upper().replace('.','-')
2365 + try:
2366 + idx = res_index[letter]
2367 + except KeyError:
2368 + # warn(f"Unknown residue found in {family} family: {letter}", error=True)
2369 + # These are K, R, etc from Rfam. The RNANet sequences provided are pure ACGUN, but not the Rfam ones.
2370 + idx = 4 # consider it is N
2371 + pssm_info[idx,residue_num] += 1.0
2372 +
2373 + # Get the number of non-gap nucleotides
2374 + N = 0
2375 + for i in range(5):
2376 + N += pssm_info[i,residue_num]
2377 +
2378 + if N>0:
2379 + # Divide base counts by number of non-gaps
2380 + for i in range(5):
2381 + pssm_info[i,residue_num] /= N
2382 +
2383 + # last line is for the gap percentage (Ngaps/Nlines)
2384 + pssm_info[5,residue_num] /= nseqs
2385 +
2386 + # Define consensus base for this position:
2387 + if pssm_info[5,residue_num] > 0.7:
2388 + # gaps are in majority if over 75% (that's my definition)
2389 + consensus.append('-')
2390 + else:
2391 + idx = np.argmax(pssm_info[0:5,residue_num])
2392 + if pssm_info[idx, residue_num] > 0.5:
2393 + consensus.append(letters[idx])
2394 + else:
2395 + consensus.append('N')
2396 +
2397 + # At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars.
2398 +
2393 2399
2394 ########################################################################################## 2400 ##########################################################################################
2395 # Remap sequences of the 3D chains with sequences in the alignment 2401 # Remap sequences of the 3D chains with sequences in the alignment
...@@ -2397,60 +2403,51 @@ def work_pssm_remap(f, fill_gaps): ...@@ -2397,60 +2403,51 @@ def work_pssm_remap(f, fill_gaps):
2397 2403
2398 setproctitle(f"RNAnet.py work_pssm_remap({f}) remap") 2404 setproctitle(f"RNAnet.py work_pssm_remap({f}) remap")
2399 2405
2400 - # For each sequence, find the right chain and remap chain residues with alignment columns 2406 + # For each sequence, remap chain residues with sequence alignment
2401 columns_to_save = set() 2407 columns_to_save = set()
2402 re_mappings = [] 2408 re_mappings = []
2403 - alilen = align.get_alignment_length() 2409 + pbar = tqdm(total=nseqs, position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False)
2404 - pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Remap {f} chains", leave=False)
2405 pbar.update(0) 2410 pbar.update(0)
2406 for s in align: 2411 for s in align:
2407 - if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain 2412 + # skip Rfamseq entries
2413 + if not '[' in s.id:
2408 continue 2414 continue
2409 2415
2410 - # Check if the chain existed before in the database 2416 + # Get the chain id in the database
2411 - if s.id in chains_ids: 2417 + conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
2412 - # a chain object is found in the update, this sequence is new 2418 + conn.execute('pragma journal_mode=wal')
2413 - this_chain = list_of_chains[chains_ids.index(s.id)] 2419 + db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
2414 - seq_to_align = this_chain.seq_to_align 2420 + if len(db_id):
2415 - full_length = this_chain.full_length 2421 + db_id = db_id[0][0]
2416 - db_id = this_chain.db_chain_id
2417 else: 2422 else:
2418 - # it existed in the database before.
2419 - # Get the chain id in the database
2420 - conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0)
2421 - conn.execute('pragma journal_mode=wal')
2422 - db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
2423 - if len(db_id):
2424 - db_id = db_id[0][0]
2425 - else:
2426 - conn.close()
2427 - warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...")
2428 - pbar.update(1)
2429 - continue
2430 - seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
2431 - full_length = len(seq_to_align)
2432 conn.close() 2423 conn.close()
2424 + warn(f"Bizarre... sequence {s.id} is not found in the database ! Cannot remap it ! Ignoring...")
2425 + pbar.update(1)
2426 + continue
2427 + seq_to_align = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;")])
2428 + full_length = len(seq_to_align)
2429 + conn.close()
2433 2430
2434 # Save columns in the appropriate positions 2431 # Save columns in the appropriate positions
2435 i = 0 # to iterate the object sequence 2432 i = 0 # to iterate the object sequence
2436 j = 0 # to iterate the alignment sequence 2433 j = 0 # to iterate the alignment sequence
2437 - while i < full_length and j < alilen: 2434 + while i < full_length and j < ncols:
2438 # Here we try to map seq_to_align (the sequence of the 3D chain, including gaps when residues are missing), 2435 # Here we try to map seq_to_align (the sequence of the 3D chain, including gaps when residues are missing),
2439 # with s.seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and . 2436 # with s.seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
2440 2437
2441 - if seq_to_align[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps) 2438 + if seq_to_align[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps)
2442 re_mappings.append((db_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1. 2439 re_mappings.append((db_id, i+1, j+1)) # because index_chain in table nucleotide is in [1,N], we use i+1 and j+1.
2443 columns_to_save.add(j+1) # it's a set, doublons are automaticaly ignored 2440 columns_to_save.add(j+1) # it's a set, doublons are automaticaly ignored
2444 i += 1 2441 i += 1
2445 j += 1 2442 j += 1
2446 - elif seq_to_align[i] == '-': # gap in the chain, but not in the aligned sequence 2443 + elif seq_to_align[i] == '-': # '-' in the chain, but '.' or letter in the aligned sequence
2447 # search for a gap to the consensus nearby 2444 # search for a gap to the consensus nearby
2448 k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.' 2445 k = 0 # Search must start at zero to assert the difference comes from '-' in front of '.'
2449 - while j+k < alilen and s.seq[j+k] == '.': 2446 + while j+k < ncols and s.seq[j+k] == '.':
2450 k += 1 2447 k += 1
2451 2448
2452 # if found, set j to that position 2449 # if found, set j to that position
2453 - if j+k < alilen and s.seq[j+k] == '-': 2450 + if j+k < ncols and s.seq[j+k] == '-':
2454 re_mappings.append((db_id, i+1, j+k+1)) 2451 re_mappings.append((db_id, i+1, j+k+1))
2455 columns_to_save.add(j+k+1) 2452 columns_to_save.add(j+k+1)
2456 i += 1 2453 i += 1
...@@ -2458,31 +2455,28 @@ def work_pssm_remap(f, fill_gaps): ...@@ -2458,31 +2455,28 @@ def work_pssm_remap(f, fill_gaps):
2458 continue 2455 continue
2459 2456
2460 # if not, take the insertion gap if this is one 2457 # if not, take the insertion gap if this is one
2461 - if j < alilen and s.seq[j] == '.': 2458 + if j < ncols and s.seq[j] == '.':
2462 re_mappings.append((db_id, i+1, j+1)) 2459 re_mappings.append((db_id, i+1, j+1))
2463 columns_to_save.add(j+1) 2460 columns_to_save.add(j+1)
2464 i += 1 2461 i += 1
2465 j += 1 2462 j += 1
2466 continue 2463 continue
2467 2464
2468 - # else, just mark the gap as unknown (there is an alignment mismatch) 2465 + # else, just mark the gap as unknown (there is an alignment mismatch '-' in the 3D facing a letter in the alignment)
2469 re_mappings.append((db_id, i+1, 0)) 2466 re_mappings.append((db_id, i+1, 0))
2470 i += 1 2467 i += 1
2471 elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain 2468 elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
2472 j += 1 # ignore the column 2469 j += 1 # ignore the column
2473 else: # sequence mismatch which is not a gap... 2470 else: # sequence mismatch which is not a gap...
2474 - print(f"You are never supposed to reach this. Comparing {self.chain_label} in {i} ({self.seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).", 2471 + print(f"You are never supposed to reach this. Comparing {s.id} in {i} ({seq_to_align[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).",
2475 - self.seq_to_align, s.seq, sep='\n', flush=True) 2472 + seq_to_align, s.seq, sep='\n', flush=True)
2476 raise Exception('Something is wrong with sequence alignment.') 2473 raise Exception('Something is wrong with sequence alignment.')
2477 2474
2478 pbar.update(1) 2475 pbar.update(1)
2479 pbar.close() 2476 pbar.close()
2480 2477
2481 - # Check we found something 2478 + # Get a sorted list from the set
2482 - if not len(re_mappings): 2479 + columns = sorted(columns_to_save)
2483 - warn(f"Chains were not found in {f}++.afa file: {chains_ids}", error=True)
2484 - return 1
2485 -
2486 2480
2487 ########################################################################################## 2481 ##########################################################################################
2488 # Save the alignment columns and their mappings to the database 2482 # Save the alignment columns and their mappings to the database
...@@ -2505,75 +2499,48 @@ def work_pssm_remap(f, fill_gaps): ...@@ -2505,75 +2499,48 @@ def work_pssm_remap(f, fill_gaps):
2505 if col not in columns_to_save: 2499 if col not in columns_to_save:
2506 unused.append((f, col)) 2500 unused.append((f, col))
2507 sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused) 2501 sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
2502 + conn.commit()
2503 +
2508 # Save the useful columns in the database 2504 # Save the useful columns in the database
2509 - data = [(f, j) + frequencies[j-1] for j in sorted(columns_to_save)] 2505 + data = [(f, j) + tuple(pssm_info[:,j-1]) + (consensus[j-1],) for j in sorted(columns_to_save)]
2510 - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) 2506 + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus)
2511 - VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO 2507 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
2512 - UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, freq_other=excluded.freq_other;""", many=True, data=data) 2508 + UPDATE SET freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
2513 - # Add an unknown values column, with index_ali 0 2509 + freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus;""", many=True, data=data)
2514 - sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other) 2510 + # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
2515 - VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 1.0);""", data=(f,)) 2511 + sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus)
2512 + VALUES (?, 0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-');""", data=(f,))
2516 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) 2513 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
2517 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) 2514 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
2518 conn.close() 2515 conn.close()
2519 2516
2520 ########################################################################################## 2517 ##########################################################################################
2521 - # Replacing gaps in the 3D chains by consensus sequences 2518 + # Saving the filtered alignment with only the saved positions
2522 ########################################################################################## 2519 ##########################################################################################
2523 2520
2524 - setproctitle(f"RNAnet.py work_pssm_remap({f}) replace gaps") 2521 + setproctitle(f"RNAnet.py work_pssm_remap({f}) filtering alignment")
2525 2522
2526 - # Replace gaps by consensus 2523 + # filter the alignment
2527 - if fill_gaps: 2524 + names = [ x.id for x in align if '[' in x.id ]
2528 - pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: Replace {f} gaps", leave=False) 2525 + align = align[-len(names):]
2529 - pbar.update(0) 2526 + filtered_alignment = align[:, 1:1] # all the lines, but no columns
2530 - gaps = [] 2527 + for p in columns:
2531 - conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=10.0) 2528 + filtered_alignment += align[:, p-1:p] # save columns one by one
2532 - conn.execute('pragma journal_mode=wal')
2533 - for s in align:
2534 - if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain
2535 - continue
2536 -
2537 - db_id = sql_ask_database(conn, f"SELECT chain_id FROM chain WHERE structure_id = '{s.id.split('[')[0]}' AND chain_name = '{s.id.split('-')[1]}' AND rfam_acc = '{f}';")
2538 - if len(db_id):
2539 - db_id = db_id[0][0]
2540 - else:
2541 - pbar.update(1)
2542 - continue
2543 - seq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
2544 - aliseq = ''.join([ x[0] for x in sql_ask_database(conn, f"SELECT nt_align_code FROM nucleotide WHERE chain_id = {db_id} ORDER BY index_chain ASC;") ])
2545 - full_length = len(seq)
2546 -
2547 - # detect gaps
2548 - c_seq = list(seq) # contains "ACGUNacgu-"
2549 - letters = ['A', 'C', 'G', 'U', 'N']
2550 - homology_data = sql_ask_database(conn, f"""SELECT freq_A, freq_C, freq_G, freq_U, freq_other FROM
2551 - (SELECT chain_id, rfam_acc FROM chain WHERE chain_id={db_id})
2552 - NATURAL JOIN re_mapping
2553 - NATURAL JOIN align_column;
2554 - """)
2555 - if homology_data is None or not len(homology_data):
2556 - with open(runDir + "/errors.txt", "a") as errf:
2557 - errf.write(f"No homology data found in the database for {s.id} ! Not replacing gaps.\n")
2558 - continue
2559 - elif len(homology_data) != full_length:
2560 - with open(runDir + "/errors.txt", "a") as errf:
2561 - errf.write(f"Found {len(homology_data)} nucleotides for {s.id} of length {full_length} ! Not replacing gaps.\n")
2562 - continue
2563 - for i in range(full_length):
2564 - if c_seq[i] == '-':
2565 - freq = homology_data[i]
2566 - l = letters[freq.index(max(freq))]
2567 - gaps.append((l, l == 'A', l == 'C', l == 'G', l == 'U', l == 'N', db_id, i+1))
2568 - pbar.update(1)
2569 - sql_execute(conn, f"""UPDATE nucleotide SET nt_align_code = ?,
2570 - is_A = ?, is_C = ?, is_G = ?, is_U = ?, is_other = ?
2571 - WHERE chain_id = ? AND index_chain = ?;""", many=True, data=gaps)
2572 - conn.close()
2573 - idxQueue.put(thr_idx) # replace the thread index in the queue
2574 2529
2575 - setproctitle(f"RNAnet.py work_pssm_remap({f}) finished") 2530 + # write it to file in both STK and FASTA formats (STK required for distance matrices in statistics)
2531 + with open(path_to_seq_data+f"/realigned/{f}_3d_only.stk", "w") as only_3d:
2532 + try:
2533 + only_3d.write(format(filtered_alignment, "stockholm"))
2534 + except ValueError as e:
2535 + warn(e)
2536 + with open(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "w") as only_3d:
2537 + try:
2538 + only_3d.write(format(filtered_alignment, "fasta"))
2539 + except ValueError as e:
2540 + warn(e)
2576 2541
2542 + setproctitle(f"RNAnet.py work_pssm_remap({f}) finished")
2543 + idxQueue.put(thr_idx) # replace the thread index in the queue
2577 return 0 2544 return 0
2578 2545
2579 2546
...@@ -2587,7 +2554,7 @@ def work_save(c, homology=True): ...@@ -2587,7 +2554,7 @@ def work_save(c, homology=True):
2587 if homology: 2554 if homology:
2588 df = pd.read_sql_query(f""" 2555 df = pd.read_sql_query(f"""
2589 SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, 2556 SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code,
2590 - is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, dbn, 2557 + is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, dbn,
2591 paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, 2558 paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta,
2592 chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, 2559 chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
2593 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM 2560 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM
...@@ -2631,9 +2598,9 @@ if __name__ == "__main__": ...@@ -2631,9 +2598,9 @@ if __name__ == "__main__":
2631 sql_define_tables(conn) 2598 sql_define_tables(conn)
2632 print("> Storing results into", runDir + "/results/RNANet.db") 2599 print("> Storing results into", runDir + "/results/RNANet.db")
2633 2600
2634 - # # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc). 2601 + # compute an update compared to what is in the table "chain" (comparison on structure_id + chain_name + rfam_acc).
2635 - # # If --all was passed, all the structures are kept. 2602 + # If --all was passed, all the structures are kept.
2636 - # # Fills pp.update with Chain() objects. 2603 + # Fills pp.update with Chain() objects.
2637 # pp.list_available_mappings() 2604 # pp.list_available_mappings()
2638 2605
2639 # =========================================================================== 2606 # ===========================================================================
...@@ -2642,7 +2609,8 @@ if __name__ == "__main__": ...@@ -2642,7 +2609,8 @@ if __name__ == "__main__":
2642 2609
2643 # # Download and annotate new RNA 3D chains (Chain objects in pp.update) 2610 # # Download and annotate new RNA 3D chains (Chain objects in pp.update)
2644 # # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed. 2611 # # If the original cif file and/or the Json DSSR annotation file already exist, they are not redownloaded/recomputed.
2645 - # pp.dl_and_annotate(coeff_ncores=0.5) 2612 + # # pp.dl_and_annotate(coeff_ncores=0.5)
2613 + # pp.dl_and_annotate(coeff_ncores=1.0)
2646 # print("Here we go.") 2614 # print("Here we go.")
2647 2615
2648 # # At this point, the structure table is up to date. 2616 # # At this point, the structure table is up to date.
...@@ -2710,7 +2678,7 @@ if __name__ == "__main__": ...@@ -2710,7 +2678,7 @@ if __name__ == "__main__":
2710 # Prepare the results 2678 # Prepare the results
2711 # ========================================================================================== 2679 # ==========================================================================================
2712 2680
2713 - pp.sanitize_database() 2681 + # pp.sanitize_database()
2714 pp.output_results() 2682 pp.output_results()
2715 2683
2716 print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove 2684 print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain, do not remove
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
4 # Run this file if you want the base counts, pair-type counts, identity percents, etc 4 # Run this file if you want the base counts, pair-type counts, identity percents, etc
5 # in the database. 5 # in the database.
6 6
7 -import getopt, os, pickle, sqlite3, shlex, subprocess, sys 7 +import getopt, os, pickle, sqlite3, shlex, subprocess, sys, warnings
8 import numpy as np 8 import numpy as np
9 import pandas as pd 9 import pandas as pd
10 import threading as th 10 import threading as th
...@@ -16,6 +16,7 @@ import scipy.cluster.hierarchy as sch ...@@ -16,6 +16,7 @@ import scipy.cluster.hierarchy as sch
16 from scipy.spatial.distance import squareform 16 from scipy.spatial.distance import squareform
17 from mpl_toolkits.mplot3d import axes3d 17 from mpl_toolkits.mplot3d import axes3d
18 from Bio import AlignIO, SeqIO 18 from Bio import AlignIO, SeqIO
19 +from Bio.PDB.MMCIFParser import MMCIFParser
19 from functools import partial 20 from functools import partial
20 from multiprocessing import Pool, Manager 21 from multiprocessing import Pool, Manager
21 from os import path 22 from os import path
...@@ -429,11 +430,7 @@ def parallel_stats_pairs(f): ...@@ -429,11 +430,7 @@ def parallel_stats_pairs(f):
429 @trace_unhandled_exceptions 430 @trace_unhandled_exceptions
430 def to_id_matrix(f): 431 def to_id_matrix(f):
431 """ 432 """
432 - Extracts sequences of 3D chains from the family alignments to a distinct STK file, 433 + Runs esl-alipid on the filtered alignment to get an identity matrix.
433 - then runs esl-alipid on it to get an identity matrix.
434 -
435 - Side-effect : also produces the 3D_only family alignment as a separate file.
436 - So, we use this function to update 'ali_filtered_length' in the family table.
437 """ 434 """
438 if path.isfile("data/"+f+".npy"): 435 if path.isfile("data/"+f+".npy"):
439 return 0 436 return 0
...@@ -444,35 +441,18 @@ def to_id_matrix(f): ...@@ -444,35 +441,18 @@ def to_id_matrix(f):
444 441
445 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_id_matrix({f})") 442 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} to_id_matrix({f})")
446 443
447 - # Prepare a file 444 + if not path.isfile(f"{path_to_seq_data}/realigned/{f}_3d_only.stk"):
448 - with open(path_to_seq_data+f"/realigned/{f}++.afa") as al_file: 445 + warn(f"File not found: {path_to_seq_data}/realigned/{f}_3d_only.stk")
449 - al = AlignIO.read(al_file, "fasta") 446 + align = AlignIO.read(f"{path_to_seq_data}/realigned/{f}_3d_only.stk", "stockholm")
450 - names = [ x.id for x in al if '[' in x.id ] 447 + names = [ x.id for x in align if '[' in x.id ]
451 - al = al[-len(names):]
452 - with open(path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk", "w") as only_3d:
453 - try:
454 - only_3d.write(al.format("stockholm"))
455 - except ValueError as e:
456 - warn(e)
457 - del al
458 - subprocess.run(["esl-reformat", "--informat", "stockholm", "--mingap", #
459 - "-o", path_to_seq_data+f"/realigned/{f}_3d_only.stk", # This run just deletes columns of gaps
460 - "stockholm", path_to_seq_data+f"/realigned/{f}_3d_only_tmp.stk"]) #
461 - subprocess.run(["rm", "-f", f + "_3d_only_tmp.stk", f + "_3d_only.stk"])
462 - subprocess.run(["esl-reformat", "-o", path_to_seq_data+f"/realigned/{f}_3d_only.afa", "afa", path_to_seq_data+f"/realigned/{f}_3d_only.stk"])
463 -
464 - # Out-of-scope task : update the database with the length of the filtered alignment:
465 - align = AlignIO.read(path_to_seq_data+f"/realigned/{f}_3d_only.afa", "fasta")
466 - with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
467 - conn.execute('pragma journal_mode=wal')
468 - sql_execute(conn, "UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=[align.get_alignment_length(), f])
469 del align 448 del align
470 - 449 +
450 + pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False)
451 + pbar.update(0)
452 +
471 # Prepare the job 453 # Prepare the job
472 - process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}realigned/{f}_3d_only.stk"), 454 + process = subprocess.Popen(shlex.split(f"esl-alipid --rna --noheader --informat stockholm {path_to_seq_data}/realigned/{f}_3d_only.stk"), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
473 - stdout=subprocess.PIPE, stderr=subprocess.PIPE)
474 id_matrix = np.zeros((len(names), len(names))) 455 id_matrix = np.zeros((len(names), len(names)))
475 - pbar = tqdm(total = len(names)*(len(names)-1)*0.5, position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} idty matrix", unit="comparisons", leave=False)
476 cnt = 0 456 cnt = 0
477 while not cnt or process.poll() is None: 457 while not cnt or process.poll() is None:
478 output = process.stdout.read() 458 output = process.stdout.read()
...@@ -632,7 +612,6 @@ def stats_pairs(): ...@@ -632,7 +612,6 @@ def stats_pairs():
632 plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99) 612 plt.subplots_adjust(left=0.1, bottom=0.16, top=0.95, right=0.99)
633 plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png") 613 plt.savefig(runDir + f"/results/figures/pairings_{res_thr}.png")
634 614
635 - setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
636 notify("Computed nucleotide statistics and saved CSV and PNG file.") 615 notify("Computed nucleotide statistics and saved CSV and PNG file.")
637 616
638 @trace_unhandled_exceptions 617 @trace_unhandled_exceptions
...@@ -931,7 +910,141 @@ def general_stats(): ...@@ -931,7 +910,141 @@ def general_stats():
931 hspace=0.05, bottom=0.12, top=0.84) 910 hspace=0.05, bottom=0.12, top=0.84)
932 fig.savefig(runDir + "/results/figures/Nfamilies.png") 911 fig.savefig(runDir + "/results/figures/Nfamilies.png")
933 plt.close() 912 plt.close()
913 +
914 +def get_matrix_euclidian_distance(cif_file, aligned_seq, consider_all_atoms):
915 + """
916 + This function
917 + - loads the coordinates and the alignment, reconstructs the alignment but with coordinates, considering gaps, and
918 + - computes the matrix of Euclidean distances.
919 +
920 + Returns:
921 + The 2D np.array of Euclidean distances between pairs of nucleotides, with np.nan in the rows and columns of gap positions.
922 + """
923 + # Load the barycenter coordinates
924 + coordinates = nt_3d_centers(cif_file, consider_all_atoms)
925 +
926 + # reconstruct the alignment but with coordinates
927 + nb_gap = 0
928 + coordinates_with_gaps = []
929 + for i in range(len(aligned_seq)):
930 + if aligned_seq[i] == '.' or aligned_seq[i] == '-':
931 + nb_gap = nb_gap + 1
932 + coordinates_with_gaps.append('NA')
933 + else:
934 + coordinates_with_gaps.append(coordinates[i - nb_gap])
935 +
936 + nb_nucleotides = len(coordinates_with_gaps) # number of nucleotides
937 + matrix = np.zeros((nb_nucleotides, nb_nucleotides)) # create a new empty matrix of size nxn
938 +
939 + # Fill this new matrix with the Euclidean distances between all pairs of nucleotides, considering gaps:
940 + for i in range(nb_nucleotides):
941 + for j in range(nb_nucleotides):
942 + if coordinates_with_gaps[i] == 'NA' or coordinates_with_gaps[j] == 'NA':
943 + matrix[i][j] = np.nan
944 + else:
945 + matrix[i][j] = round(get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]),3)
946 + return(matrix)
947 +
948 +@trace_unhandled_exceptions
949 +def get_avg_std_distance_matrix(f, consider_all_atoms):
950 + # Get a worker number to position the progress bar
951 + global idxQueue
952 + thr_idx = idxQueue.get()
953 +
954 + setproctitle(f"RNANet statistics.py Worker {thr_idx+1} {f} residue distance matrices")
955 +
956 + if consider_all_atoms:
957 + label = "base"
958 + else:
959 + label = "backbone"
960 +
961 + os.makedirs(runDir + '/results/distance_matrices/' + f + '_' + label, exist_ok=True )
962 +
963 +
964 + family_matrices = []
965 + align = AlignIO.read(path_to_seq_data + f"realigned/{f}_3d_only.afa", "fasta")
966 + found = 0
967 + notfound = 0
968 + pbar = tqdm(total = len(align), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} {label} distance matrices", unit="chains", leave=False)
969 + pbar.update(0)
970 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
971 + conn.execute('pragma journal_mode=wal')
972 + r = sql_ask_database(conn, f"SELECT structure_id, '_1_', chain_name, '_', CAST(pdb_start AS TEXT), '-', CAST(pdb_end AS TEXT) FROM chain WHERE rfam_acc='{f}';")
973 + filelist = [ ''.join(list(x))+'.cif' for x in r ]
974 +
975 + for s in align:
976 + filename = ''
977 + for file in filelist:
978 + if file.startswith(s.id.replace('-', '').replace('[', '_').replace(']', '_')):
979 + filename = path_to_3D_data + "rna_mapped_to_Rfam/" + file
980 + break
981 + if len(filename):
982 + found += 1
983 + try:
984 + euclidian_distance = get_matrix_euclidian_distance(filename, s.seq, consider_all_atoms)
985 + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', euclidian_distance, delimiter=",", fmt="%.3f")
986 + family_matrices.append(euclidian_distance)
987 + except FileNotFoundError:
988 + found -= 1
989 + notfound += 1
990 + else:
991 + notfound += 1
992 + pbar.update(1)
993 +
994 + # Calculation of the average matrix
995 + avgarray = np.array(family_matrices)
996 + if len(avgarray) == 0 or np.prod(avgarray.shape) == 0:
997 + warn(f"Something's wrong with the shapes: {avgarray.shape}", error=True)
998 + with warnings.catch_warnings():
999 + warnings.simplefilter("ignore", category=RuntimeWarning)
1000 + matrix_average_distances = np.nanmean(avgarray, axis=0 )
1001 +
1002 + if len(matrix_average_distances) != 0:
1003 + matrix_average_distances = np.nan_to_num(matrix_average_distances)
1004 + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.csv' , np.triu(matrix_average_distances), delimiter=",", fmt="%.3f")
1005 +
1006 + fig, ax = plt.subplots()
1007 + im = ax.imshow(matrix_average_distances)
1008 + cbar = ax.figure.colorbar(im, ax=ax)
1009 + cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
1010 + ax.set_title("Average distance between residues (Angströms)")
1011 + fig.tight_layout()
1012 + fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_average.png', dpi=300)
1013 + plt.close()
1014 +
1015 + # Calculation of the standard deviation matrix
1016 + with warnings.catch_warnings():
1017 + warnings.simplefilter("ignore", category=RuntimeWarning)
1018 + matrix_standard_deviation_distances = np.nanstd(avgarray, axis=0 )
1019 +
1020 + if len(matrix_standard_deviation_distances) != 0:
1021 + matrix_standard_deviation_distances = np.nan_to_num(matrix_standard_deviation_distances)
1022 + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_stdev.csv' , np.triu(matrix_standard_deviation_distances), delimiter=",", fmt="%.3f")
1023 +
1024 + fig, ax = plt.subplots()
1025 + im = ax.imshow(matrix_standard_deviation_distances)
1026 + cbar = ax.figure.colorbar(im, ax=ax)
1027 + cbar.ax.set_ylabel("Angströms", rotation=-90, va="bottom")
1028 + ax.set_title("Average distance between residues (Angströms)")
1029 + fig.tight_layout()
1030 + fig.savefig(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_std.png', dpi=300)
1031 + plt.close()
1032 +
1033 + # Save log
1034 + with open(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '.log', 'a') as logfile:
1035 + logfile.write(str(found)+ " chains taken into account for computation. "+ str(notfound)+ " were not found.\n")
1036 +
1037 + # Save associated nucleotide frequencies (off-topic but convenient to do it here)
1038 + with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
1039 + conn.execute('pragma journal_mode=wal')
1040 + df = pd.read_sql_query(f"SELECT freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus FROM align_column WHERE rfam_acc = '{f}' AND index_ali > 0 ORDER BY index_ali ASC;", conn)
1041 + df.to_csv(runDir + '/results/distance_matrices/' + f + '_'+ label + '/' + f + '_frequencies.csv', float_format="%.3f")
1042 +
1043 + pbar.close()
1044 +
1045 + idxQueue.put(thr_idx) # replace the thread index in the queue
934 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished") 1046 setproctitle(f"RNANet statistics.py Worker {thr_idx+1} finished")
1047 + return 0
935 1048
936 def log_to_pbar(pbar): 1049 def log_to_pbar(pbar):
937 def update(r): 1050 def update(r):
...@@ -952,6 +1065,48 @@ def family_order(f): ...@@ -952,6 +1065,48 @@ def family_order(f):
952 else: 1065 else:
953 return 2 1066 return 2
954 1067
1068 +def nt_3d_centers(cif_file, consider_all_atoms):
1069 + """Return the nucleotides' coordinates, summarizing a nucleotide by only one point.
1070 + If consider_all_atoms : barycentre is used
1071 + else: C1' atom is the nucleotide
1072 + """
1073 + result =[]
1074 + structure = MMCIFParser().get_structure(cif_file, cif_file)
1075 +
1076 + if consider_all_atoms == True:
1077 + for model in structure:
1078 + for chain in model:
1079 + for residue in chain:
1080 + temp_list = []
1081 + res_isobaricentre = 0
1082 + for atom in residue:
1083 + temp_list.append(atom.get_coord())
1084 + lg = len(temp_list)
1085 +
1086 + summ = np.sum(temp_list, axis = 0)
1087 + res_isobaricentre = [summ[0]/lg, summ[1]/lg, summ[2]/lg]
1088 + result.append([res_isobaricentre[0], res_isobaricentre[1], res_isobaricentre[2]])
1089 +
1090 + elif consider_all_atoms == False:
1091 + for model in structure:
1092 + for chain in model:
1093 + for residue in chain:
1094 + for atom in residue:
1095 + if atom.get_name() == "C1'":
1096 + coordinates = atom.get_coord()
1097 + res = [coordinates[0], coordinates[1], coordinates[2]]
1098 + result.append(res)
1099 + return(result)
1100 +
1101 +def get_euclidian_distance(L1, L2):
1102 + """Returns the distance between two points (coordinates in lists)
1103 + """
1104 +
1105 + e = 0
1106 + for i in range(len(L1)):
1107 + e += float(L1[i] - L2[i])**2
1108 + return np.sqrt(e)
1109 +
955 if __name__ == "__main__": 1110 if __name__ == "__main__":
956 1111
957 os.makedirs(runDir + "/results/figures/", exist_ok=True) 1112 os.makedirs(runDir + "/results/figures/", exist_ok=True)
...@@ -959,8 +1114,9 @@ if __name__ == "__main__": ...@@ -959,8 +1114,9 @@ if __name__ == "__main__":
959 # parse options 1114 # parse options
960 DELETE_OLD_DATA = False 1115 DELETE_OLD_DATA = False
961 DO_WADLEY_ANALYSIS = False 1116 DO_WADLEY_ANALYSIS = False
1117 + DO_AVG_DISTANCE_MATRIX = False
962 try: 1118 try:
963 - opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "resolution=", "3d-folder=", "seq-folder=" ]) 1119 + opts, _ = getopt.getopt( sys.argv[1:], "r:h", [ "help", "from-scratch", "wadley", "distance-matrices", "resolution=", "3d-folder=", "seq-folder=" ])
964 except getopt.GetoptError as err: 1120 except getopt.GetoptError as err:
965 print(err) 1121 print(err)
966 sys.exit(2) 1122 sys.exit(2)
...@@ -979,9 +1135,12 @@ if __name__ == "__main__": ...@@ -979,9 +1135,12 @@ if __name__ == "__main__":
979 print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:" 1135 print("--seq-folder=…\t\t\tPath to a folder containing the sequence and alignment files. Required subfolder:"
980 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") 1136 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
981 print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything") 1137 print("--from-scratch\t\t\tDo not use precomputed results from past runs, recompute everything")
1138 + print("--distance-matrices\t\tCompute average distance between nucleotide pairs for each family.")
1139 + print("--wadley\t\t\tReproduce Wadley & al 2007 clustering of pseudotorsions.")
1140 +
982 sys.exit() 1141 sys.exit()
983 elif opt == '--version': 1142 elif opt == '--version':
984 - print("RNANet statistics 1.2") 1143 + print("RNANet statistics 1.3 beta")
985 sys.exit() 1144 sys.exit()
986 elif opt == "-r" or opt == "--resolution": 1145 elif opt == "-r" or opt == "--resolution":
987 assert float(arg) > 0.0 and float(arg) <= 20.0 1146 assert float(arg) > 0.0 and float(arg) <= 20.0
...@@ -997,6 +1156,8 @@ if __name__ == "__main__": ...@@ -997,6 +1156,8 @@ if __name__ == "__main__":
997 elif opt=='--from-scratch': 1156 elif opt=='--from-scratch':
998 DELETE_OLD_DATA = True 1157 DELETE_OLD_DATA = True
999 DO_WADLEY_ANALYSIS = True 1158 DO_WADLEY_ANALYSIS = True
1159 + elif opt=="--distance-matrices":
1160 + DO_AVG_DISTANCE_MATRIX = True
1000 elif opt=='--wadley': 1161 elif opt=='--wadley':
1001 DO_WADLEY_ANALYSIS = True 1162 DO_WADLEY_ANALYSIS = True
1002 1163
...@@ -1030,6 +1191,8 @@ if __name__ == "__main__": ...@@ -1030,6 +1191,8 @@ if __name__ == "__main__":
1030 subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"]) 1191 subprocess.run(["rm","-f", runDir + f"/data/{f}.npy", runDir + f"/data/{f}_pairs.csv", runDir + f"/data/{f}_counts.csv"])
1031 if DO_WADLEY_ANALYSIS: 1192 if DO_WADLEY_ANALYSIS:
1032 subprocess.run(["rm","-f", runDir + f"/data/wadley_kernel_eta_{res_thr}.npz", runDir + f"/data/wadley_kernel_eta_prime_{res_thr}.npz", runDir + f"/data/pair_counts_{res_thr}.csv"]) 1193 subprocess.run(["rm","-f", runDir + f"/data/wadley_kernel_eta_{res_thr}.npz", runDir + f"/data/wadley_kernel_eta_prime_{res_thr}.npz", runDir + f"/data/pair_counts_{res_thr}.csv"])
1194 + if DO_AVG_DISTANCE_MATRIX:
1195 + subprocess.run(["rm", "-rf", runDir + f"/results/distance_matrices/"])
1033 1196
1034 # Prepare the multiprocessing execution environment 1197 # Prepare the multiprocessing execution environment
1035 nworkers = min(read_cpu_number()-1, 32) 1198 nworkers = min(read_cpu_number()-1, 32)
...@@ -1043,6 +1206,17 @@ if __name__ == "__main__": ...@@ -1043,6 +1206,17 @@ if __name__ == "__main__":
1043 if n_unmapped_chains and DO_WADLEY_ANALYSIS: 1206 if n_unmapped_chains and DO_WADLEY_ANALYSIS:
1044 joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr))) 1207 joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
1045 joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr))) 1208 joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
1209 + if DO_AVG_DISTANCE_MATRIX:
1210 + extracted_chains = []
1211 + for file in os.listdir(path_to_3D_data + "rna_mapped_to_Rfam"):
1212 + if os.path.isfile(os.path.join(path_to_3D_data + "rna_mapped_to_Rfam", file)):
1213 + e1 = file.split('_')[0]
1214 + e2 = file.split('_')[1]
1215 + e3 = file.split('_')[2]
1216 + extracted_chains.append(e1 + '[' + e2 + ']' + '-' + e3)
1217 + for f in famlist:
1218 + joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, True)))
1219 + joblist.append(Job(function=get_avg_std_distance_matrix, args=(f, False)))
1046 joblist.append(Job(function=stats_len)) # Computes figures 1220 joblist.append(Job(function=stats_len)) # Computes figures
1047 joblist.append(Job(function=stats_freq)) # updates the database 1221 joblist.append(Job(function=stats_freq)) # updates the database
1048 for f in famlist: 1222 for f in famlist:
......