Showing
1 changed file
with
28 additions
and
18 deletions
... | @@ -293,7 +293,7 @@ class Chain: | ... | @@ -293,7 +293,7 @@ class Chain: |
293 | "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base", | 293 | "epsilon_zeta", "bb_type", "chi", "glyco_bond", "form", "ssZp", "Dp", "eta", "theta", "eta_prime", "theta_prime", "eta_base", "theta_base", |
294 | "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"] | 294 | "v0", "v1", "v2", "v3", "v4", "amplitude", "phase_angle", "puckering"] |
295 | df = df[cols_we_keep] | 295 | df = df[cols_we_keep] |
296 | - | 296 | + #print(df.iloc[0,:]) |
297 | except KeyError as e: | 297 | except KeyError as e: |
298 | warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) | 298 | warn(f"Error while parsing DSSR {self.pdb_id}.json output:{e}", error=True) |
299 | self.delete_me = True | 299 | self.delete_me = True |
... | @@ -358,7 +358,7 @@ class Chain: | ... | @@ -358,7 +358,7 @@ class Chain: |
358 | self.delete_me = True | 358 | self.delete_me = True |
359 | self.error_messages = f"Error with parsing of duplicate residues numbers." | 359 | self.error_messages = f"Error with parsing of duplicate residues numbers." |
360 | return None | 360 | return None |
361 | - | 361 | + #print(df.iloc[0,:]) |
362 | # Search for ligands at the end of the selection | 362 | # Search for ligands at the end of the selection |
363 | # Drop ligands detected as residues by DSSR, by detecting several markers | 363 | # Drop ligands detected as residues by DSSR, by detecting several markers |
364 | while ( | 364 | while ( |
... | @@ -376,7 +376,7 @@ class Chain: | ... | @@ -376,7 +376,7 @@ class Chain: |
376 | self.mapping.log("Droping ligand:") | 376 | self.mapping.log("Droping ligand:") |
377 | self.mapping.log(df.tail(1)) | 377 | self.mapping.log(df.tail(1)) |
378 | df = df.head(-1) | 378 | df = df.head(-1) |
379 | - | 379 | + #print(df.iloc[0,:]) |
380 | # Duplicates in index_chain : drop, they are ligands | 380 | # Duplicates in index_chain : drop, they are ligands |
381 | # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) | 381 | # e.g. 3iwn_1_B_1-91, ligand C2E has index_chain 1 (and nt_resnum 601) |
382 | duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] | 382 | duplicates = [ index for index, element in enumerate(df.duplicated(['index_chain']).values) if element ] |
... | @@ -386,7 +386,7 @@ class Chain: | ... | @@ -386,7 +386,7 @@ class Chain: |
386 | if self.mapping is not None: | 386 | if self.mapping is not None: |
387 | self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.") | 387 | self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.") |
388 | df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain | 388 | df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain |
389 | - | 389 | + #print(df.iloc[0,:]) |
390 | # drop eventual nts with index_chain < the first residue, | 390 | # drop eventual nts with index_chain < the first residue, |
391 | # now negative because we renumber to 1 (usually, ligands) | 391 | # now negative because we renumber to 1 (usually, ligands) |
392 | ligands = df[df.index_chain < 0] | 392 | ligands = df[df.index_chain < 0] |
... | @@ -396,7 +396,7 @@ class Chain: | ... | @@ -396,7 +396,7 @@ class Chain: |
396 | self.mapping.log("Droping ligand:") | 396 | self.mapping.log("Droping ligand:") |
397 | self.mapping.log(line) | 397 | self.mapping.log(line) |
398 | df = df.drop(ligands.index) | 398 | df = df.drop(ligands.index) |
399 | - | 399 | + #print(df.iloc[0,:]) |
400 | # Find missing index_chain values | 400 | # Find missing index_chain values |
401 | # This happens because of resolved nucleotides that have a | 401 | # This happens because of resolved nucleotides that have a |
402 | # strange nt_resnum value. Thanks, biologists ! :@ :( | 402 | # strange nt_resnum value. Thanks, biologists ! :@ :( |
... | @@ -422,7 +422,7 @@ class Chain: | ... | @@ -422,7 +422,7 @@ class Chain: |
422 | df.iloc[i+1:, 1] += 1 | 422 | df.iloc[i+1:, 1] += 1 |
423 | else: | 423 | else: |
424 | warn(f"Missing index_chain {i} in {self.chain_label} !") | 424 | warn(f"Missing index_chain {i} in {self.chain_label} !") |
425 | - | 425 | + #print(df.iloc[0,:]) |
426 | # Assert some nucleotides still exist | 426 | # Assert some nucleotides still exist |
427 | try: | 427 | try: |
428 | # update length of chain from nt_resnum point of view | 428 | # update length of chain from nt_resnum point of view |
... | @@ -452,12 +452,13 @@ class Chain: | ... | @@ -452,12 +452,13 @@ class Chain: |
452 | # index_chain 1 |-------------|77 83|------------| 154 | 452 | # index_chain 1 |-------------|77 83|------------| 154 |
453 | # expected data point 1 |--------------------------------| 154 | 453 | # expected data point 1 |--------------------------------| 154 |
454 | # | 454 | # |
455 | - | 455 | + #print(df[['index_chain', 'nt_resnum', 'nt_id', 'nt_code']]) |
456 | if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l | 456 | if l != len(df['index_chain']): # if some residues are missing, len(df['index_chain']) < l |
457 | resnum_start = df.iloc[0, 1] | 457 | resnum_start = df.iloc[0, 1] |
458 | # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start) | 458 | # the rowIDs the missing nucleotides would have (rowID = index_chain - 1 = nt_resnum - resnum_start) |
459 | diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) | 459 | diff = set(range(l)).difference(df['nt_resnum'] - resnum_start) |
460 | for i in sorted(diff): | 460 | for i in sorted(diff): |
461 | + #print(i) | ||
461 | # Add a row at position i | 462 | # Add a row at position i |
462 | df = pd.concat([df.iloc[:i], | 463 | df = pd.concat([df.iloc[:i], |
463 | pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, | 464 | pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, |
... | @@ -465,12 +466,17 @@ class Chain: | ... | @@ -465,12 +466,17 @@ class Chain: |
465 | df.iloc[i:]]) | 466 | df.iloc[i:]]) |
466 | # Increase the index_chain of all following lines | 467 | # Increase the index_chain of all following lines |
467 | df.iloc[i+1:, 0] += 1 | 468 | df.iloc[i+1:, 0] += 1 |
469 | + #pairs=df[['index_chain', 'nt_resnum', 'nt_id', 'nt_code']] | ||
470 | + #print(pairs.iloc[:40]) | ||
468 | df = df.reset_index(drop=True) | 471 | df = df.reset_index(drop=True) |
472 | + #pairs=df[['index_chain', 'nt_resnum', 'nt_id', 'nt_code']] | ||
473 | + #print(pairs.iloc[:40]) | ||
469 | self.full_length = len(df.index_chain) | 474 | self.full_length = len(df.index_chain) |
470 | - | 475 | + #print(df.iloc[0,:]) |
471 | ####################################### | 476 | ####################################### |
472 | # Compute new features | 477 | # Compute new features |
473 | ####################################### | 478 | ####################################### |
479 | + #print(df[['index_chain', 'nt_resnum', 'nt_id', 'nt_code']]) | ||
474 | 480 | ||
475 | # Convert angles | 481 | # Convert angles |
476 | df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians | 482 | df.loc[:, ['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'epsilon_zeta', 'chi', 'v0', 'v1', 'v2', 'v3', 'v4', # Conversion to radians |
... | @@ -540,7 +546,7 @@ class Chain: | ... | @@ -540,7 +546,7 @@ class Chain: |
540 | pair_type_LW[nt2_idx] += ',' + lw_pair[0] + lw_pair[2] + lw_pair[1] | 546 | pair_type_LW[nt2_idx] += ',' + lw_pair[0] + lw_pair[2] + lw_pair[1] |
541 | pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1] | 547 | pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1] |
542 | paired[nt2_idx] += ',' + str(nt1_idx + 1) | 548 | paired[nt2_idx] += ',' + str(nt1_idx + 1) |
543 | - | 549 | + |
544 | # transform nt_id to shorter values | 550 | # transform nt_id to shorter values |
545 | df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^', '').replace('/', '') for n, name in zip(df.nt_id, df.nt_name) ] | 551 | df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^', '').replace('/', '') for n, name in zip(df.nt_id, df.nt_name) ] |
546 | 552 | ||
... | @@ -548,10 +554,10 @@ class Chain: | ... | @@ -548,10 +554,10 @@ class Chain: |
548 | df['pair_type_LW'] = pair_type_LW | 554 | df['pair_type_LW'] = pair_type_LW |
549 | df['pair_type_DSSR'] = pair_type_DSSR | 555 | df['pair_type_DSSR'] = pair_type_DSSR |
550 | df['nb_interact'] = interacts | 556 | df['nb_interact'] = interacts |
551 | - | 557 | + #print(df.iloc[0,:]) |
552 | # remove now useless descriptors | 558 | # remove now useless descriptors |
553 | df = df.drop(['nt_id', 'nt_resnum'], axis=1) | 559 | df = df.drop(['nt_id', 'nt_resnum'], axis=1) |
554 | - | 560 | + #print(df.iloc[0,:]) |
555 | self.seq = "".join(df.nt_code) | 561 | self.seq = "".join(df.nt_code) |
556 | self.seq_to_align = "".join(df.nt_align_code) | 562 | self.seq_to_align = "".join(df.nt_align_code) |
557 | self.length = len([x for x in self.seq_to_align if x != "-"]) | 563 | self.length = len([x for x in self.seq_to_align if x != "-"]) |
... | @@ -566,7 +572,9 @@ class Chain: | ... | @@ -566,7 +572,9 @@ class Chain: |
566 | # Log chain info to file | 572 | # Log chain info to file |
567 | if save_logs and self.mapping is not None: | 573 | if save_logs and self.mapping is not None: |
568 | self.mapping.to_file(self.chain_label+".log") | 574 | self.mapping.to_file(self.chain_label+".log") |
569 | - | 575 | + #print(df.iloc[0,:]) |
576 | + #pairs=df[['index_chain', 'old_nt_resnum', 'paired']] | ||
577 | + #print(pairs.iloc[:40]) | ||
570 | return df | 578 | return df |
571 | 579 | ||
572 | def register_chain(self, df): | 580 | def register_chain(self, df): |
... | @@ -904,7 +912,7 @@ class Mapping: | ... | @@ -904,7 +912,7 @@ class Mapping: |
904 | 912 | ||
905 | newdf = df.drop(df[(df.nt_resnum < self.nt_start) | | 913 | newdf = df.drop(df[(df.nt_resnum < self.nt_start) | |
906 | (df.nt_resnum > self.nt_end)].index) | 914 | (df.nt_resnum > self.nt_end)].index) |
907 | - | 915 | + #print(df.iloc[0,:]) |
908 | if len(newdf.index_chain) > 0: | 916 | if len(newdf.index_chain) > 0: |
909 | # everything's okay | 917 | # everything's okay |
910 | df = newdf | 918 | df = newdf |
... | @@ -917,14 +925,14 @@ class Mapping: | ... | @@ -917,14 +925,14 @@ class Mapping: |
917 | weird_mappings.add(self.chain_label + "." + self.rfam_acc) | 925 | weird_mappings.add(self.chain_label + "." + self.rfam_acc) |
918 | df = df.drop(df[(df.index_chain < self.nt_start) | | 926 | df = df.drop(df[(df.index_chain < self.nt_start) | |
919 | (df.index_chain > self.nt_end)].index) | 927 | (df.index_chain > self.nt_end)].index) |
920 | - | 928 | + #print(df.iloc[0,:]) |
921 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one | 929 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one |
922 | self.st = 0 | 930 | self.st = 0 |
923 | if len(df.index_chain) and df.iloc[0, 0] != 1: | 931 | if len(df.index_chain) and df.iloc[0, 0] != 1: |
924 | self.st = df.iloc[0, 0] - 1 | 932 | self.st = df.iloc[0, 0] - 1 |
925 | df.iloc[:, 0] -= self.st | 933 | df.iloc[:, 0] -= self.st |
926 | self.log(f"Shifting index_chain of {self.st}") | 934 | self.log(f"Shifting index_chain of {self.st}") |
927 | - | 935 | + #print(df.iloc[0,:]) |
928 | # Check that some residues are not included by mistake: | 936 | # Check that some residues are not included by mistake: |
929 | # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part, | 937 | # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part, |
930 | # because the icode are not read by DSSR. | 938 | # because the icode are not read by DSSR. |
... | @@ -2241,7 +2249,7 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | ... | @@ -2241,7 +2249,7 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
2241 | if not c.delete_me: | 2249 | if not c.delete_me: |
2242 | df = c.extract_3D_data(save_logs) | 2250 | df = c.extract_3D_data(save_logs) |
2243 | c.register_chain(df) | 2251 | c.register_chain(df) |
2244 | - | 2252 | + |
2245 | # Small check that all nucleotides of a chain have an entry in nucleotide table | 2253 | # Small check that all nucleotides of a chain have an entry in nucleotide table |
2246 | if not c.delete_me: | 2254 | if not c.delete_me: |
2247 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 2255 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
... | @@ -2257,7 +2265,7 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): | ... | @@ -2257,7 +2265,7 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): |
2257 | # extract the portion we want | 2265 | # extract the portion we want |
2258 | if extract and not c.delete_me: | 2266 | if extract and not c.delete_me: |
2259 | c.extract(df, khetatm) | 2267 | c.extract(df, khetatm) |
2260 | - | 2268 | + #print(df.iloc[0,:]) |
2261 | return c | 2269 | return c |
2262 | 2270 | ||
2263 | @trace_unhandled_exceptions | 2271 | @trace_unhandled_exceptions |
... | @@ -2798,9 +2806,11 @@ def work_save(c, homology=True): | ... | @@ -2798,9 +2806,11 @@ def work_save(c, homology=True): |
2798 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM | 2806 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM |
2799 | nucleotide WHERE chain_id = {c.db_chain_id} ORDER BY index_chain ASC;""", | 2807 | nucleotide WHERE chain_id = {c.db_chain_id} ORDER BY index_chain ASC;""", |
2800 | conn) | 2808 | conn) |
2809 | + | ||
2801 | filename = path_to_3D_data + "datapoints/" + c.chain_label | 2810 | filename = path_to_3D_data + "datapoints/" + c.chain_label |
2802 | conn.close() | 2811 | conn.close() |
2803 | - | 2812 | + pairs=df[['index_chain', 'old_nt_resnum', 'paired']] |
2813 | + print(pairs.iloc[:40]) | ||
2804 | df.to_csv(filename, float_format="%.2f", index=False) | 2814 | df.to_csv(filename, float_format="%.2f", index=False) |
2805 | 2815 | ||
2806 | if __name__ == "__main__": | 2816 | if __name__ == "__main__": | ... | ... |
-
Please register or login to post a comment