Aglaé TABOT

premier commit avec des print

Showing 1 changed file with 28 additions and 18 deletions
...@@ -293,7 +293,7 @@ class Chain: ...@@ -293,7 +293,7 @@ class Chain:
293 "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base", 293 "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base",
294 "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"] 294 "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"]
295 df = df[cols_we_keep] 295 df = df[cols_we_keep]
296 - 296 + #print(df.iloc[0,:])
297 except KeyError as e: 297 except KeyError as e:
298 warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) 298 warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True)
299 self.delete_me = True 299 self.delete_me = True
...@@ -358,7 +358,7 @@ class Chain: ...@@ -358,7 +358,7 @@ class Chain:
358 self.delete_me = True 358 self.delete_me = True
359 self.error_messages = f"Error with parsing of duplicate residues numbers." 359 self.error_messages = f"Error with parsing of duplicate residues numbers."
360 return None 360 return None
361 - 361 + #print(df.iloc[0,:])
362 # Search for ligands at the end of the selection 362 # Search for ligands at the end of the selection
363 # Drop ligands detected as residues by DSSR, by detecting several markers 363 # Drop ligands detected as residues by DSSR, by detecting several markers
364 while ( 364 while (
...@@ -376,7 +376,7 @@ class Chain: ...@@ -376,7 +376,7 @@ class Chain:
376 self.mapping.log("Droping ligand:") 376 self.mapping.log("Droping ligand:")
377 self.mapping.log(df.tail(1)) 377 self.mapping.log(df.tail(1))
378 df = df.head(-1) 378 df = df.head(-1)
379 - 379 + #print(df.iloc[0,:])
380 # Duplicates in index_chain : drop, they are ligands 380 # Duplicates in index_chain : drop, they are ligands
381 # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) 381 # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601)
382 duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] 382 duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ]
...@@ -386,7 +386,7 @@ class Chain: ...@@ -386,7 +386,7 @@ class Chain:
386 if self.mapping is not None: 386 if self.mapping is not None:
387 self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.") 387 self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.")
388 df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain 388 df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain
389 - 389 + #print(df.iloc[0,:])
390 # drop eventual nts with index_chain < the first residue, 390 # drop eventual nts with index_chain < the first residue,
391 # now negative because we renumber to 1 (usually, ligands) 391 # now negative because we renumber to 1 (usually, ligands)
392 ligands = df[df.index_chain < 0] 392 ligands = df[df.index_chain < 0]
...@@ -396,7 +396,7 @@ class Chain: ...@@ -396,7 +396,7 @@ class Chain:
396 self.mapping.log("Droping ligand:") 396 self.mapping.log("Droping ligand:")
397 self.mapping.log(line) 397 self.mapping.log(line)
398 df = df.drop(ligands.index) 398 df = df.drop(ligands.index)
399 - 399 + #print(df.iloc[0,:])
400 # Find missing index_chain values 400 # Find missing index_chain values
401 # This happens because of resolved nucleotides that have a 401 # This happens because of resolved nucleotides that have a
402 # strange nt_resnum value. Thanks, biologists ! :@ :( 402 # strange nt_resnum value. Thanks, biologists ! :@ :(
...@@ -422,7 +422,7 @@ class Chain: ...@@ -422,7 +422,7 @@ class Chain:
422 df.iloc[i+1:, 1] += 1 422 df.iloc[i+1:, 1] += 1
423 else: 423 else:
424 warn(f"Missing index_chain {i} in {self.chain_label} !") 424 warn(f"Missing index_chain {i} in {self.chain_label} !")
425 - 425 + #print(df.iloc[0,:])
426 # Assert some nucleotides still exist 426 # Assert some nucleotides still exist
427 try: 427 try:
428 # update length of chain from nt_resnum point of view 428 # update length of chain from nt_resnum point of view
...@@ -452,12 +452,13 @@ class Chain: ...@@ -452,12 +452,13 @@ class Chain:
452 # index_chain 1 |-------------|77 83|------------| 154 452 # index_chain 1 |-------------|77 83|------------| 154
453 # expected data point 1 |--------------------------------| 154 453 # expected data point 1 |--------------------------------| 154
454 # 454 #
455 - 455 + #print(df[['index_chain', 'nt_resnum', 'nt_id', 'nt_code']])
456 if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l 456 if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l
457 resnum_start = df.iloc[0, 1] 457 resnum_start = df.iloc[0, 1]
458 # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start) 458 # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start)
459 diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) 459 diff = set(range(l)).difference(df['nt_resnum'] - resnum_start)
460 for i in sorted(diff): 460 for i in sorted(diff):
461 + #print(i)
461 # Add a row at position i 462 # Add a row at position i
462 df = pd.concat([df.iloc[:i], 463 df = pd.concat([df.iloc[:i],
463 pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, 464 pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start,
...@@ -465,12 +466,17 @@ class Chain: ...@@ -465,12 +466,17 @@ class Chain:
465 df.iloc[i:]]) 466 df.iloc[i:]])
466 # Increase the index_chain of all following lines 467 # Increase the index_chain of all following lines
467 df.iloc[i+1:, 0] += 1 468 df.iloc[i+1:, 0] += 1
469 + #pairs=df[['index_chain', 'nt_resnum', 'nt_id', 'nt_code']]
470 + #print(pairs.iloc[:40])
468 df = df.reset_index(drop=True) 471 df = df.reset_index(drop=True)
472 + #pairs=df[['index_chain', 'nt_resnum', 'nt_id', 'nt_code']]
473 + #print(pairs.iloc[:40])
469 self.full_length = len(df.index_chain) 474 self.full_length = len(df.index_chain)
470 - 475 + #print(df.iloc[0,:])
471 ####################################### 476 #######################################
472 # Compute new features 477 # Compute new features
473 ####################################### 478 #######################################
479 + #print(df[['index_chain', 'nt_resnum', 'nt_id', 'nt_code']])
474 480
475 # Convert angles 481 # Convert angles
476 df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians 482 df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians
...@@ -540,7 +546,7 @@ class Chain: ...@@ -540,7 +546,7 @@ class Chain:
540 pair_type_LW[nt2_idx] += ',' + lw_pair[0] + lw_pair[2] + lw_pair[1] 546 pair_type_LW[nt2_idx] += ',' + lw_pair[0] + lw_pair[2] + lw_pair[1]
541 pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1] 547 pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1]
542 paired[nt2_idx] += ',' + str(nt1_idx + 1) 548 paired[nt2_idx] += ',' + str(nt1_idx + 1)
543 - 549 +
544 # transform nt_id to shorter values 550 # transform nt_id to shorter values
545 df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^', '').replace('/', '') for n, name in zip(df.nt_id, df.nt_name) ] 551 df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^', '').replace('/', '') for n, name in zip(df.nt_id, df.nt_name) ]
546 552
...@@ -548,10 +554,10 @@ class Chain: ...@@ -548,10 +554,10 @@ class Chain:
548 df['pair_type_LW'] = pair_type_LW 554 df['pair_type_LW'] = pair_type_LW
549 df['pair_type_DSSR'] = pair_type_DSSR 555 df['pair_type_DSSR'] = pair_type_DSSR
550 df['nb_interact'] = interacts 556 df['nb_interact'] = interacts
551 - 557 + #print(df.iloc[0,:])
552 # remove now useless descriptors 558 # remove now useless descriptors
553 df = df.drop(['nt_id', 'nt_resnum'], axis=1) 559 df = df.drop(['nt_id', 'nt_resnum'], axis=1)
554 - 560 + #print(df.iloc[0,:])
555 self.seq = "".join(df.nt_code) 561 self.seq = "".join(df.nt_code)
556 self.seq_to_align = "".join(df.nt_align_code) 562 self.seq_to_align = "".join(df.nt_align_code)
557 self.length = len([x for x in self.seq_to_align if x != "-"]) 563 self.length = len([x for x in self.seq_to_align if x != "-"])
...@@ -566,7 +572,9 @@ class Chain: ...@@ -566,7 +572,9 @@ class Chain:
566 # Log chain info to file 572 # Log chain info to file
567 if save_logs and self.mapping is not None: 573 if save_logs and self.mapping is not None:
568 self.mapping.to_file(self.chain_label+".log") 574 self.mapping.to_file(self.chain_label+".log")
569 - 575 + #print(df.iloc[0,:])
576 + #pairs=df[['index_chain', 'old_nt_resnum', 'paired']]
577 + #print(pairs.iloc[:40])
570 return df 578 return df
571 579
572 def register_chain(self, df): 580 def register_chain(self, df):
...@@ -904,7 +912,7 @@ class Mapping: ...@@ -904,7 +912,7 @@ class Mapping:
904 912
905 newdf = df.drop(df[(df.nt_resnum < self.nt_start) | 913 newdf = df.drop(df[(df.nt_resnum < self.nt_start) |
906 (df.nt_resnum > self.nt_end)].index) 914 (df.nt_resnum > self.nt_end)].index)
907 - 915 + #print(df.iloc[0,:])
908 if len(newdf.index_chain) > 0: 916 if len(newdf.index_chain) > 0:
909 # everything's okay 917 # everything's okay
910 df = newdf 918 df = newdf
...@@ -917,14 +925,14 @@ class Mapping: ...@@ -917,14 +925,14 @@ class Mapping:
917 weird_mappings.add(self.chain_label + "." + self.rfam_acc) 925 weird_mappings.add(self.chain_label + "." + self.rfam_acc)
918 df = df.drop(df[(df.index_chain < self.nt_start) | 926 df = df.drop(df[(df.index_chain < self.nt_start) |
919 (df.index_chain > self.nt_end)].index) 927 (df.index_chain > self.nt_end)].index)
920 - 928 + #print(df.iloc[0,:])
921 # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one 929 # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one
922 self.st = 0 930 self.st = 0
923 if len(df.index_chain) and df.iloc[0, 0] != 1: 931 if len(df.index_chain) and df.iloc[0, 0] != 1:
924 self.st = df.iloc[0, 0] - 1 932 self.st = df.iloc[0, 0] - 1
925 df.iloc[:, 0] -= self.st 933 df.iloc[:, 0] -= self.st
926 self.log(f"Shifting index_chain of {self.st}") 934 self.log(f"Shifting index_chain of {self.st}")
927 - 935 + #print(df.iloc[0,:])
928 # Check that some residues are not included by mistake: 936 # Check that some residues are not included by mistake:
929 # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part, 937 # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part,
930 # because the icode are not read by DSSR. 938 # because the icode are not read by DSSR.
...@@ -2241,7 +2249,7 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): ...@@ -2241,7 +2249,7 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
2241 if not c.delete_me: 2249 if not c.delete_me:
2242 df = c.extract_3D_data(save_logs) 2250 df = c.extract_3D_data(save_logs)
2243 c.register_chain(df) 2251 c.register_chain(df)
2244 - 2252 +
2245 # Small check that all nucleotides of a chain have an entry in nucleotide table 2253 # Small check that all nucleotides of a chain have an entry in nucleotide table
2246 if not c.delete_me: 2254 if not c.delete_me:
2247 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: 2255 with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn:
...@@ -2257,7 +2265,7 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): ...@@ -2257,7 +2265,7 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
2257 # extract the portion we want 2265 # extract the portion we want
2258 if extract and not c.delete_me: 2266 if extract and not c.delete_me:
2259 c.extract(df, khetatm) 2267 c.extract(df, khetatm)
2260 - 2268 + #print(df.iloc[0,:])
2261 return c 2269 return c
2262 2270
2263 @trace_unhandled_exceptions 2271 @trace_unhandled_exceptions
...@@ -2798,9 +2806,11 @@ def work_save(c, homology=True): ...@@ -2798,9 +2806,11 @@ def work_save(c, homology=True):
2798 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM 2806 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM
2799 nucleotide WHERE chain_id = {c.db_chain_id} ORDER BY index_chain ASC;""", 2807 nucleotide WHERE chain_id = {c.db_chain_id} ORDER BY index_chain ASC;""",
2800 conn) 2808 conn)
2809 +
2801 filename = path_to_3D_data + "datapoints/" + c.chain_label 2810 filename = path_to_3D_data + "datapoints/" + c.chain_label
2802 conn.close() 2811 conn.close()
2803 - 2812 + pairs=df[['index_chain', 'old_nt_resnum', 'paired']]
2813 + print(pairs.iloc[:40])
2804 df.to_csv(filename, float_format="%.2f", index=False) 2814 df.to_csv(filename, float_format="%.2f", index=False)
2805 2815
2806 if __name__ == "__main__": 2816 if __name__ == "__main__":
......