Reordered steps in 3D data extraction + removed inverted numbering support
Showing 3 changed files with 132 additions and 127 deletions
... | @@ -52,7 +52,7 @@ class SelectivePortionSelector(object): | ... | @@ -52,7 +52,7 @@ class SelectivePortionSelector(object): |
52 | 52 | ||
53 | def __init__(self, model_id, chain_id, valid_resnums, khetatm): | 53 | def __init__(self, model_id, chain_id, valid_resnums, khetatm): |
54 | self.chain_id = chain_id | 54 | self.chain_id = chain_id |
55 | - self.resnums = valid_resnums | 55 | + self.resnums = valid_resnums # list of strings, that are mostly ints |
56 | self.pdb_model_id = model_id | 56 | self.pdb_model_id = model_id |
57 | self.hydrogen_regex = re.compile("[123 ]*H.*") | 57 | self.hydrogen_regex = re.compile("[123 ]*H.*") |
58 | self.keep_hetatm = khetatm | 58 | self.keep_hetatm = khetatm |
... | @@ -70,15 +70,12 @@ class SelectivePortionSelector(object): | ... | @@ -70,15 +70,12 @@ class SelectivePortionSelector(object): |
70 | if hetatm_flag in ["W", "H_MG"]: | 70 | if hetatm_flag in ["W", "H_MG"]: |
71 | return int(self.keep_hetatm) | 71 | return int(self.keep_hetatm) |
72 | 72 | ||
73 | - # I don't really know what this is but the doc said to warn: | ||
74 | - if icode != " ": | ||
75 | - pass | ||
76 | - # warn(f"icode {icode} at position {resseq}\t\t") | ||
77 | - | ||
78 | # Accept the residue if it is in the right interval: | 73 | # Accept the residue if it is in the right interval: |
79 | - if len(self.resnums): | 74 | + if icode == " " and len(self.resnums): |
80 | - return int(resseq in self.resnums) | 75 | + return int(str(resseq) in self.resnums) |
81 | - else: | 76 | + elif icode != " " and len(self.resnums): |
77 | + return int(str(resseq)+icode in self.resnums) | ||
78 | + else: # len(resnum) == 0, we don't use mappings (--no-homology option) | ||
82 | return 1 | 79 | return 1 |
83 | 80 | ||
84 | def accept_atom(self, atom): | 81 | def accept_atom(self, atom): |
... | @@ -153,7 +150,7 @@ class Chain: | ... | @@ -153,7 +150,7 @@ class Chain: |
153 | def __hash__(self): | 150 | def __hash__(self): |
154 | return hash((self.pdb_id, self.pdb_model, self.pdb_chain_id, self.chain_label)) | 151 | return hash((self.pdb_id, self.pdb_model, self.pdb_chain_id, self.chain_label)) |
155 | 152 | ||
156 | - def extract(self, khetatm): | 153 | + def extract(self, df, khetatm): |
157 | """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file. | 154 | """ Extract the part which is mapped to Rfam from the main CIF file and save it to another file. |
158 | """ | 155 | """ |
159 | 156 | ||
... | @@ -180,12 +177,8 @@ class Chain: | ... | @@ -180,12 +177,8 @@ class Chain: |
180 | mmcif_parser = MMCIFParser() | 177 | mmcif_parser = MMCIFParser() |
181 | s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") | 178 | s = mmcif_parser.get_structure(self.pdb_id, path_to_3D_data + "RNAcifs/"+self.pdb_id+".cif") |
182 | 179 | ||
183 | - | ||
184 | - # Extract the desired chain | ||
185 | - c = s[model_idx][self.pdb_chain_id] | ||
186 | - | ||
187 | if self.mapping is not None: | 180 | if self.mapping is not None: |
188 | - valid_set = set(self.mapping.old_nt_resnums) | 181 | + valid_set = set(df.old_nt_resnum) |
189 | else: | 182 | else: |
190 | valid_set = set() | 183 | valid_set = set() |
191 | 184 | ||
... | @@ -262,17 +255,32 @@ class Chain: | ... | @@ -262,17 +255,32 @@ class Chain: |
262 | # Select the nucleotides we need | 255 | # Select the nucleotides we need |
263 | ############################################# | 256 | ############################################# |
264 | 257 | ||
265 | - | ||
266 | # Remove nucleotides of the chain that are outside the Rfam mapping, if any | 258 | # Remove nucleotides of the chain that are outside the Rfam mapping, if any |
267 | if self.mapping is not None: | 259 | if self.mapping is not None: |
260 | + if self.mapping.nt_start > self.mapping.nt_end: | ||
261 | + warn(f"Mapping is reversed, this case is not supported (yet). Ignoring chain {self.chain_label}.") | ||
262 | + self.delete_me = True | ||
263 | + self.error_messages = f"Mapping is reversed, this case is not supported (yet)." | ||
264 | + return None | ||
268 | df = self.mapping.filter_df(df) | 265 | df = self.mapping.filter_df(df) |
269 | 266 | ||
270 | # Duplicate residue numbers : shift numbering | 267 | # Duplicate residue numbers : shift numbering |
271 | - # Example: 4v9q-DV contains 17 and 17A which are both read 17 by DSSR. | ||
272 | while True in df.duplicated(['nt_resnum']).values: | 268 | while True in df.duplicated(['nt_resnum']).values: |
273 | i = df.duplicated(['nt_resnum']).values.tolist().index(True) | 269 | i = df.duplicated(['nt_resnum']).values.tolist().index(True) |
274 | - self.mapping.shift_resnum_range(i) | 270 | + resnumlist = df.nt_resnum.tolist() |
271 | + if self.mapping is not None: | ||
272 | + self.mapping.log(f"Shifting nt_resnum numbering because of duplicate residue {resnumlist[i]}") | ||
273 | + | ||
274 | + if resnumlist[i] == resnumlist[i-1]: | ||
275 | + # Common 4v9q-DV case : e.g. chains contains 17 and 17A which are both read 17 by DSSR. | ||
276 | + # Solution : we shift the numbering of 17A (to 18) and the following residues. | ||
275 | df.iloc[i:, 1] += 1 | 277 | df.iloc[i:, 1] += 1 |
278 | + else: | ||
279 | + # 4v9k-DA case : the nt_id is not the full nt_resnum: ... 1629 > 1630 > 163B > 1631 > ... | ||
280 | + # Here the 163B is read 163 by DSSR, but there already is a residue 163. | ||
281 | + # Solution : set nt_resnum[i] to nt_resnum[i-1] + 1, and shift the following by 1. | ||
282 | + df.iloc[i, 1] = 1 + df.iloc[i-1, 1] | ||
283 | + df.iloc[i+1:, 1] += 1 | ||
276 | 284 | ||
277 | # Search for ligands at the end of the selection | 285 | # Search for ligands at the end of the selection |
278 | # Drop ligands detected as residues by DSSR, by detecting several markers | 286 | # Drop ligands detected as residues by DSSR, by detecting several markers |
... | @@ -280,11 +288,12 @@ class Chain: | ... | @@ -280,11 +288,12 @@ class Chain: |
280 | (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all() | 288 | (df.iloc[[-1]][["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "v0", "v1", "v2", "v3", "v4"]].isna().values).all() |
281 | or (df.iloc[[-1]].puckering=='').any() | 289 | or (df.iloc[[-1]].puckering=='').any() |
282 | ) | 290 | ) |
283 | - or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] ) | 291 | + or ( len(df.index_chain) >= 2 and df.iloc[-1,1] > 50 + df.iloc[-2,1] ) # large nt_resnum gap between the two last residues |
284 | or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] ) | 292 | or ( len(df.index_chain) and df.iloc[-1,2] in ["GNG", "E2C", "OHX", "IRI", "MPD", "8UZ"] ) |
285 | ): | 293 | ): |
286 | if self.mapping is not None: | 294 | if self.mapping is not None: |
287 | - self.mapping.drop_ligand(df.tail(1)) | 295 | + self.mapping.log("Droping ligand:") |
296 | + self.mapping.log(df.tail(1)) | ||
288 | df = df.head(-1) | 297 | df = df.head(-1) |
289 | 298 | ||
290 | # Duplicates in index_chain : drop, they are ligands | 299 | # Duplicates in index_chain : drop, they are ligands |
... | @@ -295,8 +304,6 @@ class Chain: | ... | @@ -295,8 +304,6 @@ class Chain: |
295 | warn(f"Found duplicated index_chain {df.iloc[i,0]} in {self.chain_label}. Keeping only the first.") | 304 | warn(f"Found duplicated index_chain {df.iloc[i,0]} in {self.chain_label}. Keeping only the first.") |
296 | if self.mapping is not None: | 305 | if self.mapping is not None: |
297 | self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.") | 306 | self.mapping.log(f"Found duplicated index_chain {df.iloc[i,0]}. Keeping only the first.") |
298 | - with open("duplicates.txt", "a") as f: | ||
299 | - f.write(f"DEBUG: {self.chain_label} has duplicate index_chains !\n") | ||
300 | df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain | 307 | df = df.drop_duplicates("index_chain", keep="first") # drop doublons in index_chain |
301 | 308 | ||
302 | # drop eventual nts with index_chain < the first residue, | 309 | # drop eventual nts with index_chain < the first residue, |
... | @@ -305,30 +312,31 @@ class Chain: | ... | @@ -305,30 +312,31 @@ class Chain: |
305 | if len(ligands.index_chain): | 312 | if len(ligands.index_chain): |
306 | if self.mapping is not None: | 313 | if self.mapping is not None: |
307 | for line in ligands.iterrows(): | 314 | for line in ligands.iterrows(): |
308 | - self.mapping.drop_ligand(line) | 315 | + self.mapping.log("Droping ligand:") |
316 | + self.mapping.log(line) | ||
309 | df = df.drop(ligands.index) | 317 | df = df.drop(ligands.index) |
310 | 318 | ||
311 | # Find missing index_chain values | 319 | # Find missing index_chain values |
312 | # This happens because of resolved nucleotides that have a | 320 | # This happens because of resolved nucleotides that have a |
313 | - # strange nt_resnum value | 321 | + # strange nt_resnum value. Thanks, biologists ! :@ :( |
314 | # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3' | 322 | # e.g. 4v49-AA, position 5'- 1003 -> 2003 -> 1004 - 3' |
315 | diff = set(range(df.shape[0])).difference(df['index_chain'] - 1) | 323 | diff = set(range(df.shape[0])).difference(df['index_chain'] - 1) |
316 | - if len(diff): | 324 | + if len(diff) and self.mapping is not None: |
317 | - warn(f"Missing residues regarding index_chain: {[1+i for i in sorted(diff)]}") | 325 | + # warn(f"Missing residues in chain numbering: {[1+i for i in sorted(diff)]}") |
318 | for i in sorted(diff): | 326 | for i in sorted(diff): |
319 | - # check if a nucleotide numbered +1000 exists in the nts object | 327 | + # check if a nucleotide with the correct index_chain exists in the nts object |
320 | found = None | 328 | found = None |
321 | for nt in nts: # nts is the object from the loaded JSON and contains all nts | 329 | for nt in nts: # nts is the object from the loaded JSON and contains all nts |
322 | if nt['chain_name'] != self.pdb_chain_id: | 330 | if nt['chain_name'] != self.pdb_chain_id: |
323 | continue | 331 | continue |
324 | - if nt['index_chain'] == i + 1 : | 332 | + if nt['index_chain'] == i + 1 + self.mapping.st: |
325 | found = nt | 333 | found = nt |
326 | break | 334 | break |
327 | if found: | 335 | if found: |
336 | + self.mapping.log(f"Residue {i+1+self.mapping.st}-{self.mapping.st} = {i+1} has been saved and renumbered {df.iloc[i,1]} instead of {found['nt_id'].replace(found['chain_name']+ '.' + found['nt_name'], '').replace('^','')}") | ||
328 | df_row = pd.DataFrame([found], index=[i])[df.columns.values] | 337 | df_row = pd.DataFrame([found], index=[i])[df.columns.values] |
329 | - if self.mapping is not None: | 338 | + df_row.iloc[0,0] = i+1 # index_chain |
330 | - self.mapping.insert_new(i+1, found['nt_resnum'], df.iloc[i,1]) | 339 | + df_row.iloc[0,1] = df.iloc[i,1] # nt_resnum |
331 | - df_row.iloc[0,1] = df.iloc[i,1] | ||
332 | df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ]) | 340 | df = pd.concat([ df.iloc[:i], df_row, df.iloc[i:] ]) |
333 | df.iloc[i+1:, 1] += 1 | 341 | df.iloc[i+1:, 1] += 1 |
334 | else: | 342 | else: |
... | @@ -369,7 +377,7 @@ class Chain: | ... | @@ -369,7 +377,7 @@ class Chain: |
369 | for i in sorted(diff): | 377 | for i in sorted(diff): |
370 | # Add a row at position i | 378 | # Add a row at position i |
371 | df = pd.concat([ df.iloc[:i], | 379 | df = pd.concat([ df.iloc[:i], |
372 | - pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, "nt_code":'-', "nt_name":'-'}, index=[i]), | 380 | + pd.DataFrame({"index_chain": i+1, "nt_resnum": i+resnum_start, "nt_id":"not resolved", "nt_code":'-', "nt_name":'-'}, index=[i]), |
373 | df.iloc[i:] ]) | 381 | df.iloc[i:] ]) |
374 | # Increase the index_chain of all following lines | 382 | # Increase the index_chain of all following lines |
375 | df.iloc[i+1:, 0] += 1 | 383 | df.iloc[i+1:, 0] += 1 |
... | @@ -424,11 +432,11 @@ class Chain: | ... | @@ -424,11 +432,11 @@ class Chain: |
424 | if paired[nt1_idx] == "": | 432 | if paired[nt1_idx] == "": |
425 | pair_type_LW[nt1_idx] = lw_pair | 433 | pair_type_LW[nt1_idx] = lw_pair |
426 | pair_type_DSSR[nt1_idx] = dssr_pair | 434 | pair_type_DSSR[nt1_idx] = dssr_pair |
427 | - paired[nt1_idx] = str(nt2_idx + 1) | 435 | + paired[nt1_idx] = str(nt2_idx + 1) # index + 1 is actually index_chain. |
428 | else: | 436 | else: |
429 | pair_type_LW[nt1_idx] += ',' + lw_pair | 437 | pair_type_LW[nt1_idx] += ',' + lw_pair |
430 | pair_type_DSSR[nt1_idx] += ',' + dssr_pair | 438 | pair_type_DSSR[nt1_idx] += ',' + dssr_pair |
431 | - paired[nt1_idx] += ',' + str(nt2_idx + 1) | 439 | + paired[nt1_idx] += ',' + str(nt2_idx + 1) # index + 1 is actually index_chain. |
432 | 440 | ||
433 | # set nucleotide 2 with the opposite base-pair | 441 | # set nucleotide 2 with the opposite base-pair |
434 | if nt2 in res_ids: | 442 | if nt2 in res_ids: |
... | @@ -442,43 +450,14 @@ class Chain: | ... | @@ -442,43 +450,14 @@ class Chain: |
442 | pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1] | 450 | pair_type_DSSR[nt2_idx] += ',' + dssr_pair[0] + dssr_pair[3] + dssr_pair[2] + dssr_pair[1] |
443 | paired[nt2_idx] += ',' + str(nt1_idx + 1) | 451 | paired[nt2_idx] += ',' + str(nt1_idx + 1) |
444 | 452 | ||
453 | + # transform nt_id to shorter values | ||
454 | + df['old_nt_resnum'] = [ n.replace(self.pdb_chain_id+'.'+name, '').replace('^','') for n, name in zip(df.nt_id, df.nt_name) ] | ||
455 | + | ||
445 | df['paired'] = paired | 456 | df['paired'] = paired |
446 | df['pair_type_LW'] = pair_type_LW | 457 | df['pair_type_LW'] = pair_type_LW |
447 | df['pair_type_DSSR'] = pair_type_DSSR | 458 | df['pair_type_DSSR'] = pair_type_DSSR |
448 | df['nb_interact'] = interacts | 459 | df['nb_interact'] = interacts |
449 | - df = df.drop(['nt_id'], axis=1) # remove now useless descriptors | 460 | + df = df.drop(['nt_id', 'nt_resnum'], axis=1) # remove now useless descriptors |
450 | - | ||
451 | - if self.mapping.reversed: | ||
452 | - # The 3D structure is numbered from 3' to 5' instead of standard 5' to 3' | ||
453 | - # or the sequence that matches the Rfam family is 3' to 5' instead of standard 5' to 3'. | ||
454 | - # Anyways, you need to invert the angles. | ||
455 | - # TODO: angles alpha, beta, etc | ||
456 | - warn(f"Has {self.chain_label} been numbered from 3' to 5' ? Inverting pseudotorsions, other angle measures are not corrected.") | ||
457 | - df = df.reindex(index=df.index[::-1]).reset_index(drop=True) | ||
458 | - df['index_chain'] = 1 + df.index | ||
459 | - temp_eta = df['eta'] | ||
460 | - df['eta'] = [ df['theta'][n] for n in range(l) ] # eta(n) = theta(l-n+1) forall n in ]1, l] | ||
461 | - df['theta'] = [ temp_eta[n] for n in range(l) ] # theta(n) = eta(l-n+1) forall n in [1, l[ | ||
462 | - temp_eta = df['eta_prime'] | ||
463 | - df['eta_prime'] = [ df['theta_prime'][n] for n in range(l) ] # eta(n) = theta(l-n+1) forall n in ]1, l] | ||
464 | - df['theta_prime'] = [ temp_eta[n] for n in range(l) ] # theta(n) = eta(l-n+1) forall n in [1, l[ | ||
465 | - temp_eta = df['eta_base'] | ||
466 | - df['eta_base'] = [ df['theta_base'][n] for n in range(l) ] # eta(n) = theta(l-n+1) forall n in ]1, l] | ||
467 | - df['theta_base'] = [ temp_eta[n] for n in range(l) ] # theta(n) = eta(l-n+1) forall n in [1, l[ | ||
468 | - newpairs = [] | ||
469 | - for v in df['paired']: | ||
470 | - if ',' in v: | ||
471 | - temp_v = [] | ||
472 | - vs = v.split(',') | ||
473 | - for _ in vs: | ||
474 | - temp_v.append(str(l-int(_)+1) if int(_) else _ ) | ||
475 | - newpairs.append(','.join(temp_v)) | ||
476 | - else: | ||
477 | - if len(v): | ||
478 | - newpairs.append(str(l-int(v)+1) if int(v) else v ) | ||
479 | - else: # means unpaired | ||
480 | - newpairs.append(v) | ||
481 | - df['paired'] = newpairs | ||
482 | 461 | ||
483 | self.seq = "".join(df.nt_code) | 462 | self.seq = "".join(df.nt_code) |
484 | self.seq_to_align = "".join(df.nt_align_code) | 463 | self.seq_to_align = "".join(df.nt_align_code) |
... | @@ -503,20 +482,19 @@ class Chain: | ... | @@ -503,20 +482,19 @@ class Chain: |
503 | 482 | ||
504 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: | 483 | with sqlite3.connect(runDir+"/results/RNANet.db", timeout=10.0) as conn: |
505 | # Register the chain in table chain | 484 | # Register the chain in table chain |
506 | - if self.mapping.nt_start is not None: | 485 | + if self.mapping is not None: |
507 | sql_execute(conn, f""" INSERT INTO chain | 486 | sql_execute(conn, f""" INSERT INTO chain |
508 | - (structure_id, chain_name, pdb_start, pdb_end, reversed, rfam_acc, inferred, issue) | 487 | + (structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, issue) |
509 | VALUES | 488 | VALUES |
510 | - (?, ?, ?, ?, ?, ?, ?, ?) | 489 | + (?, ?, ?, ?, ?, ?, ?) |
511 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO | 490 | ON CONFLICT(structure_id, chain_name, rfam_acc) DO |
512 | UPDATE SET pdb_start=excluded.pdb_start, | 491 | UPDATE SET pdb_start=excluded.pdb_start, |
513 | pdb_end=excluded.pdb_end, | 492 | pdb_end=excluded.pdb_end, |
514 | - reversed=excluded.reversed, | ||
515 | inferred=excluded.inferred, | 493 | inferred=excluded.inferred, |
516 | issue=excluded.issue;""", | 494 | issue=excluded.issue;""", |
517 | data=(str(self.pdb_id), str(self.pdb_chain_id), | 495 | data=(str(self.pdb_id), str(self.pdb_chain_id), |
518 | int(self.mapping.nt_start), int(self.mapping.nt_end), | 496 | int(self.mapping.nt_start), int(self.mapping.nt_end), |
519 | - int(self.mapping.reversed), str(self.mapping.rfam_acc), | 497 | + str(self.mapping.rfam_acc), |
520 | int(self.mapping.inferred), int(self.delete_me))) | 498 | int(self.mapping.inferred), int(self.delete_me))) |
521 | # get the chain id | 499 | # get the chain id |
522 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain | 500 | self.db_chain_id = sql_ask_database(conn, f"""SELECT (chain_id) FROM chain |
... | @@ -536,10 +514,10 @@ class Chain: | ... | @@ -536,10 +514,10 @@ class Chain: |
536 | if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know | 514 | if df is not None and not self.delete_me: # double condition is theoretically redundant here, but you never know |
537 | sql_execute(conn, f""" | 515 | sql_execute(conn, f""" |
538 | INSERT OR IGNORE INTO nucleotide | 516 | INSERT OR IGNORE INTO nucleotide |
539 | - (chain_id, index_chain, nt_resnum, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta, | 517 | + (chain_id, index_chain, nt_name, nt_code, dbn, alpha, beta, gamma, delta, epsilon, zeta, |
540 | epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, | 518 | epsilon_zeta, bb_type, chi, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, |
541 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position, | 519 | v0, v1, v2, v3, v4, amplitude, phase_angle, puckering, nt_align_code, is_A, is_C, is_G, is_U, is_other, nt_position, |
542 | - paired, pair_type_LW, pair_type_DSSR, nb_interact) | 520 | + old_nt_resnum, paired, pair_type_LW, pair_type_DSSR, nb_interact) |
543 | VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, | 521 | VALUES ({self.db_chain_id}, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, |
544 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, | 522 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, |
545 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", | 523 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);""", |
... | @@ -890,42 +868,13 @@ class Mapping: | ... | @@ -890,42 +868,13 @@ class Mapping: |
890 | self.rfam_acc = rfam_acc | 868 | self.rfam_acc = rfam_acc |
891 | self.nt_start = pdb_start # nt_resnum numbering | 869 | self.nt_start = pdb_start # nt_resnum numbering |
892 | self.nt_end = pdb_end # nt_resnum numbering | 870 | self.nt_end = pdb_end # nt_resnum numbering |
893 | - self.reversed = (pdb_start > pdb_end) | ||
894 | self.inferred = inferred | 871 | self.inferred = inferred |
895 | - self.interval = range(pdb_start, pdb_end+1) | ||
896 | - | ||
897 | - self.old_nt_resnums = [] # to be computed | ||
898 | - self.new_nt_resnums = [] # | ||
899 | 872 | ||
900 | self.logs = [] # Events are logged when modifying the mapping | 873 | self.logs = [] # Events are logged when modifying the mapping |
901 | 874 | ||
902 | - def shift_resnum_range(self, i): | ||
903 | - self.log(f"Shifting nt_resnum numbering because of duplicate residue {self.new_nt_resnums[i]}") | ||
904 | - for j in range(i, len(self.new_nt_resnums)): | ||
905 | - self.new_nt_resnums[j] += 1 | ||
906 | - | ||
907 | - def insert_new(self, index_chain, oldresnum, newresnum): | ||
908 | - # Adds a nt that did not passed the mapping filter at first | ||
909 | - # because it was numbered with a very high nt_resnum value (outside the bounds of the mapping) | ||
910 | - # But, in practice, its index_chain is correct and in the bounds and it belongs to the mapped chain. | ||
911 | - # Those nts are only searched if there are missing index_chain values in the mapping bounds. | ||
912 | - | ||
913 | - # insert the nt_resnum values in the lists | ||
914 | - self.old_nt_resnums.insert(index_chain-1, oldresnum) | ||
915 | - self.new_nt_resnums.insert(index_chain-1, newresnum) | ||
916 | - | ||
917 | - # shift the new_nt_resnum values if needed, to avoid creating a doublon | ||
918 | - if self.new_nt_resnums[index_chain-1] == self.new_nt_resnums[index_chain]: | ||
919 | - for j in range(index_chain, len(self.new_nt_resnums)): | ||
920 | - self.new_nt_resnums[j] += 1 | ||
921 | - # warn(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}") | ||
922 | - self.log(f"Residue {index_chain} has been saved and renumbered {newresnum} instead of {oldresnum}") | ||
923 | - | ||
924 | def filter_df(self, df): | 875 | def filter_df(self, df): |
925 | - if not self.reversed: | 876 | + |
926 | newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index) | 877 | newdf = df.drop(df[(df.nt_resnum < self.nt_start) | (df.nt_resnum > self.nt_end)].index) |
927 | - else: | ||
928 | - newdf = df.drop(df[(df.nt_resnum < self.nt_end) | (df.nt_resnum > self.nt_start)].index) | ||
929 | 878 | ||
930 | if len(newdf.index_chain) > 0: | 879 | if len(newdf.index_chain) > 0: |
931 | # everything's okay | 880 | # everything's okay |
... | @@ -939,24 +888,24 @@ class Mapping: | ... | @@ -939,24 +888,24 @@ class Mapping: |
939 | weird_mappings.add(self.chain_label + "." + self.rfam_acc) | 888 | weird_mappings.add(self.chain_label + "." + self.rfam_acc) |
940 | df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index) | 889 | df = df.drop(df[(df.index_chain < self.nt_start) | (df.index_chain > self.nt_end)].index) |
941 | 890 | ||
942 | - | ||
943 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one | 891 | # If, for some reason, index_chain does not start at one (e.g. 6boh, chain GB), make it start at one |
892 | + self.st = 0 | ||
944 | if len(df.index_chain) and df.iloc[0,0] != 1: | 893 | if len(df.index_chain) and df.iloc[0,0] != 1: |
945 | - st = df.iloc[0,0] -1 | 894 | + self.st = df.iloc[0,0] -1 |
946 | - df.iloc[:, 0] -= st | 895 | + df.iloc[:, 0] -= self.st |
947 | - | 896 | + self.log(f"Shifting index_chain of {self.st}") |
948 | - self.old_nt_resnums = df.nt_resnum.tolist() | 897 | + |
949 | - self.new_nt_resnums = df.nt_resnum.tolist() | 898 | + # Check that some residues are not included by mistake: |
899 | + # e.g. 4v4t-AA.RF00382-20-55 contains 4 residues numbered 30 but actually far beyond the mapped part, | ||
900 | + # because the icode are not read by DSSR. | ||
901 | + toremove = df[ df.index_chain > self.nt_end ] | ||
902 | + if not toremove.empty: | ||
903 | + df = df.drop(toremove.index) | ||
904 | + self.log(f"Some nt_resnum values are likely to be wrong, not considering residues:") | ||
905 | + self.log(str(toremove)) | ||
950 | 906 | ||
951 | return df | 907 | return df |
952 | 908 | ||
953 | - def drop_ligand(self, df_row): | ||
954 | - self.log("Droping ligand:") | ||
955 | - self.log(df_row) | ||
956 | - i = self.new_nt_resnums.index(df_row.iloc[0,1]) | ||
957 | - self.old_nt_resnums.pop(i) | ||
958 | - self.new_nt_resnums.pop(i) | ||
959 | - | ||
960 | def log(self, message): | 909 | def log(self, message): |
961 | if isinstance(message, str): | 910 | if isinstance(message, str): |
962 | self.logs.append(message+'\n') | 911 | self.logs.append(message+'\n') |
... | @@ -964,13 +913,15 @@ class Mapping: | ... | @@ -964,13 +913,15 @@ class Mapping: |
964 | self.logs.append(str(message)) | 913 | self.logs.append(str(message)) |
965 | 914 | ||
966 | def to_file(self, filename): | 915 | def to_file(self, filename): |
916 | + if self.logs == []: | ||
917 | + return # Do not create a log file if there is nothing to log | ||
918 | + | ||
967 | if not path.exists("logs"): | 919 | if not path.exists("logs"): |
968 | os.makedirs("logs", exist_ok=True) | 920 | os.makedirs("logs", exist_ok=True) |
969 | with open("logs/"+filename, "w") as f: | 921 | with open("logs/"+filename, "w") as f: |
970 | f.writelines(self.logs) | 922 | f.writelines(self.logs) |
971 | 923 | ||
972 | 924 | ||
973 | - | ||
974 | class Pipeline: | 925 | class Pipeline: |
975 | def __init__(self): | 926 | def __init__(self): |
976 | self.dl = Downloader() | 927 | self.dl = Downloader() |
... | @@ -991,6 +942,7 @@ class Pipeline: | ... | @@ -991,6 +942,7 @@ class Pipeline: |
991 | self.EXTRACT_CHAINS = False | 942 | self.EXTRACT_CHAINS = False |
992 | self.REUSE_ALL = False | 943 | self.REUSE_ALL = False |
993 | self.SELECT_ONLY = None | 944 | self.SELECT_ONLY = None |
945 | + self.ARCHIVE = False | ||
994 | 946 | ||
995 | def process_options(self): | 947 | def process_options(self): |
996 | """Sets the paths and options of the pipeline""" | 948 | """Sets the paths and options of the pipeline""" |
... | @@ -1002,7 +954,7 @@ class Pipeline: | ... | @@ -1002,7 +954,7 @@ class Pipeline: |
1002 | [ "help", "resolution=", "keep-hetatm=", "from-scratch", | 954 | [ "help", "resolution=", "keep-hetatm=", "from-scratch", |
1003 | "fill-gaps=", "3d-folder=", "seq-folder=", | 955 | "fill-gaps=", "3d-folder=", "seq-folder=", |
1004 | "no-homology", "ignore-issues", "extract", "only=", "all", | 956 | "no-homology", "ignore-issues", "extract", "only=", "all", |
1005 | - "update-homologous" ]) | 957 | + "archive", "update-homologous" ]) |
1006 | except getopt.GetoptError as err: | 958 | except getopt.GetoptError as err: |
1007 | print(err) | 959 | print(err) |
1008 | sys.exit(2) | 960 | sys.exit(2) |
... | @@ -1043,9 +995,10 @@ class Pipeline: | ... | @@ -1043,9 +995,10 @@ class Pipeline: |
1043 | print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them") | 995 | print("--ignore-issues\t\t\tDo not ignore already known issues and attempt to compute them") |
1044 | print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files") | 996 | print("--update-homologous\t\tRe-download Rfam and SILVA databases, realign all families, and recompute all CSV files") |
1045 | print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.") | 997 | print("--from-scratch\t\t\tDelete database, local 3D and sequence files, and known issues, and recompute.") |
998 | + print("--archive\t\t\tCreate a tar.gz archive of the datapoints text files, and update the link to the latest archive") | ||
1046 | print() | 999 | print() |
1047 | print("Typical usage:") | 1000 | print("Typical usage:") |
1048 | - print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder /Data/RNA/sequences' &") | 1001 | + print(f"nohup bash -c 'time {runDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder /Data/RNA/sequences' -s --archive &") |
1049 | sys.exit() | 1002 | sys.exit() |
1050 | elif opt == '--version': | 1003 | elif opt == '--version': |
1051 | print("RNANet 1.0 alpha ") | 1004 | print("RNANet 1.0 alpha ") |
... | @@ -1102,6 +1055,8 @@ class Pipeline: | ... | @@ -1102,6 +1055,8 @@ class Pipeline: |
1102 | self.USE_KNOWN_ISSUES = False | 1055 | self.USE_KNOWN_ISSUES = False |
1103 | elif opt == "--extract": | 1056 | elif opt == "--extract": |
1104 | self.EXTRACT_CHAINS = True | 1057 | self.EXTRACT_CHAINS = True |
1058 | + elif opt == "--archive": | ||
1059 | + self.ARCHIVE = True | ||
1105 | 1060 | ||
1106 | if "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data]: | 1061 | if "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data]: |
1107 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") | 1062 | print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") |
... | @@ -1420,12 +1375,13 @@ class Pipeline: | ... | @@ -1420,12 +1375,13 @@ class Pipeline: |
1420 | conn = sqlite3.connect(runDir+"/results/RNANet.db") | 1375 | conn = sqlite3.connect(runDir+"/results/RNANet.db") |
1421 | pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", | 1376 | pd.read_sql_query("SELECT rfam_acc, description, idty_percent, nb_homologs, nb_3d_chains, nb_total_homol, max_len, comput_time, comput_peak_mem from family ORDER BY nb_3d_chains DESC;", |
1422 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) | 1377 | conn).to_csv(runDir + f"/results/archive/families_{time_str}.csv", float_format="%.2f", index=False) |
1423 | - pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, reversed, date, exp_method, resolution, issue FROM structure | 1378 | + pd.read_sql_query("""SELECT structure_id, chain_name, pdb_start, pdb_end, rfam_acc, inferred, date, exp_method, resolution, issue FROM structure |
1424 | JOIN chain ON structure.pdb_id = chain.structure_id | 1379 | JOIN chain ON structure.pdb_id = chain.structure_id |
1425 | ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) | 1380 | ORDER BY structure_id, chain_name, rfam_acc ASC;""", conn).to_csv(runDir + f"/results/archive/summary_{time_str}.csv", float_format="%.2f", index=False) |
1426 | conn.close() | 1381 | conn.close() |
1427 | 1382 | ||
1428 | # Archive the results | 1383 | # Archive the results |
1384 | + if self.SELECT_ONLY is None: | ||
1429 | os.makedirs("results/archive", exist_ok=True) | 1385 | os.makedirs("results/archive", exist_ok=True) |
1430 | subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."]) | 1386 | subprocess.run(["tar","-C", path_to_3D_data + "/datapoints","-czf",f"results/archive/RNANET_datapoints_{time_str}.tar.gz","."]) |
1431 | 1387 | ||
... | @@ -1549,7 +1505,6 @@ def sql_define_tables(conn): | ... | @@ -1549,7 +1505,6 @@ def sql_define_tables(conn): |
1549 | chain_name VARCHAR(2) NOT NULL, | 1505 | chain_name VARCHAR(2) NOT NULL, |
1550 | pdb_start SMALLINT, | 1506 | pdb_start SMALLINT, |
1551 | pdb_end SMALLINT, | 1507 | pdb_end SMALLINT, |
1552 | - reversed TINYINT, | ||
1553 | issue TINYINT, | 1508 | issue TINYINT, |
1554 | rfam_acc CHAR(7), | 1509 | rfam_acc CHAR(7), |
1555 | inferred TINYINT, | 1510 | inferred TINYINT, |
... | @@ -1578,7 +1533,7 @@ def sql_define_tables(conn): | ... | @@ -1578,7 +1533,7 @@ def sql_define_tables(conn): |
1578 | CREATE TABLE IF NOT EXISTS nucleotide ( | 1533 | CREATE TABLE IF NOT EXISTS nucleotide ( |
1579 | chain_id INT, | 1534 | chain_id INT, |
1580 | index_chain SMALLINT, | 1535 | index_chain SMALLINT, |
1581 | - nt_resnum SMALLINT, | 1536 | + old_nt_resnum VARCHAR(5), |
1582 | nt_position SMALLINT, | 1537 | nt_position SMALLINT, |
1583 | nt_name VARCHAR(5), | 1538 | nt_name VARCHAR(5), |
1584 | nt_code CHAR(1), | 1539 | nt_code CHAR(1), |
... | @@ -2004,7 +1959,7 @@ def work_build_chain(c, extract, khetatm, retrying=False): | ... | @@ -2004,7 +1959,7 @@ def work_build_chain(c, extract, khetatm, retrying=False): |
2004 | 1959 | ||
2005 | # extract the portion we want | 1960 | # extract the portion we want |
2006 | if extract and not c.delete_me: | 1961 | if extract and not c.delete_me: |
2007 | - c.extract(khetatm) | 1962 | + c.extract(df, khetatm) |
2008 | 1963 | ||
2009 | return c | 1964 | return c |
2010 | 1965 | ||
... | @@ -2331,7 +2286,7 @@ def work_save(c, homology=True): | ... | @@ -2331,7 +2286,7 @@ def work_save(c, homology=True): |
2331 | conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0) | 2286 | conn = sqlite3.connect(runDir + "/results/RNANet.db", timeout=15.0) |
2332 | if homology: | 2287 | if homology: |
2333 | df = pd.read_sql_query(f""" | 2288 | df = pd.read_sql_query(f""" |
2334 | - SELECT index_chain, nt_resnum, nt_position, nt_name, nt_code, nt_align_code, | 2289 | + SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, |
2335 | is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, dbn, | 2290 | is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, dbn, |
2336 | paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, | 2291 | paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, |
2337 | chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, | 2292 | chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, |
... | @@ -2344,7 +2299,7 @@ def work_save(c, homology=True): | ... | @@ -2344,7 +2299,7 @@ def work_save(c, homology=True): |
2344 | filename = path_to_3D_data + "datapoints/" + c.chain_label + '.' + c.mapping.rfam_acc | 2299 | filename = path_to_3D_data + "datapoints/" + c.chain_label + '.' + c.mapping.rfam_acc |
2345 | else: | 2300 | else: |
2346 | df = pd.read_sql_query(f""" | 2301 | df = pd.read_sql_query(f""" |
2347 | - SELECT index_chain, nt_resnum, nt_position, nt_name, nt_code, nt_align_code, | 2302 | + SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, |
2348 | is_A, is_C, is_G, is_U, is_other, dbn, | 2303 | is_A, is_C, is_G, is_U, is_other, dbn, |
2349 | paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, | 2304 | paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, |
2350 | chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, | 2305 | chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, |
... | @@ -2406,6 +2361,7 @@ if __name__ == "__main__": | ... | @@ -2406,6 +2361,7 @@ if __name__ == "__main__": |
2406 | print("Completed.") | 2361 | print("Completed.") |
2407 | exit(0) | 2362 | exit(0) |
2408 | 2363 | ||
2364 | + | ||
2409 | # At this point, structure, chain and nucleotide tables of the database are up to date. | 2365 | # At this point, structure, chain and nucleotide tables of the database are up to date. |
2410 | # (Modulo some statistics computed by statistics.py) | 2366 | # (Modulo some statistics computed by statistics.py) |
2411 | 2367 | ... | ... |
recompute_some_chains.py
0 → 100755
1 | +#!python3 | ||
2 | +import subprocess, os, sys | ||
3 | + | ||
4 | +# Put a list of problematic chains here, they will be properly deleted and recomputed | ||
5 | +problems = [ | ||
6 | +"4v7f_1_1_4-3396", | ||
7 | +"4v7f_1_1_1-3167", | ||
8 | +"4v7f_1_1_1-3255", | ||
9 | +"6g90_1_1_1-407", | ||
10 | +"3q3z_1_A_1-74", | ||
11 | +"3q3z_1_V_1-74", | ||
12 | +"4v7f_1_3_1-121" | ||
13 | +] | ||
14 | + | ||
15 | +path_to_3D_data = sys.argv[1] | ||
16 | +path_to_seq_data = sys.argv[2] | ||
17 | + | ||
18 | +for p in problems: | ||
19 | + print() | ||
20 | + print() | ||
21 | + print() | ||
22 | + print() | ||
23 | + | ||
24 | + # Remove the datapoints files and 3D files | ||
25 | + subprocess.run(["rm", '-f', path_to_3D_data + f"/rna_mapped_to_Rfam/{p}.cif"]) | ||
26 | + files = [ f for f in os.listdir(path_to_3D_data + "/datapoints") if p in f ] | ||
27 | + for f in files: | ||
28 | + subprocess.run(["rm", '-f', path_to_3D_data + f"/datapoints/{f}"]) | ||
29 | + | ||
30 | + # Find more information | ||
31 | + structure = p.split('_')[0] | ||
32 | + chain = p.split('_')[2] | ||
33 | + families = [ f.split('.')[1] for f in files ] # The RFAM families this chain has been mapped onto | ||
34 | + | ||
35 | + # Delete the chain from the database, and the associated nucleotides and re_mappings, using foreign keys | ||
36 | + for fam in families: | ||
37 | + command = ["sqlite3", "results/RNANet.db", f"PRAGMA foreign_keys=ON; delete from chain where structure_id=\"{structure}\" and chain_name=\"{chain}\" and rfam_acc=\"{fam}\";"] | ||
38 | + print(' '.join(command)) | ||
39 | + subprocess.run(command) | ||
40 | + | ||
41 | + # Re-run RNANet | ||
42 | + command = ["python3.8", "RNAnet.py", "--3d-folder", path_to_3D_data, "--seq-folder", path_to_seq_data, "-r", "20.0", "--extract", "--only", p] | ||
43 | + print('\n',' '.join(command),'\n') | ||
44 | + subprocess.run(command) | ||
45 | + | ||
46 | +# run statistics |
... | @@ -139,6 +139,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -139,6 +139,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
139 | fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png") | 139 | fig.savefig(f"results/figures/wadley_plots/wadley_hist_{angle}_{l}.png") |
140 | if show: | 140 | if show: |
141 | fig.show() | 141 | fig.show() |
142 | + fig.close() | ||
142 | 143 | ||
143 | # Smoothed joint distribution | 144 | # Smoothed joint distribution |
144 | fig = plt.figure() | 145 | fig = plt.figure() |
... | @@ -149,6 +150,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -149,6 +150,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
149 | fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png") | 150 | fig.savefig(f"results/figures/wadley_plots/wadley_distrib_{angle}_{l}.png") |
150 | if show: | 151 | if show: |
151 | fig.show() | 152 | fig.show() |
153 | + fig.close() | ||
152 | 154 | ||
153 | # 2D Wadley plot | 155 | # 2D Wadley plot |
154 | fig = plt.figure(figsize=(5,5)) | 156 | fig = plt.figure(figsize=(5,5)) |
... | @@ -161,6 +163,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): | ... | @@ -161,6 +163,7 @@ def reproduce_wadley_results(show=False, carbon=4, sd_range=(1,4)): |
161 | fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png") | 163 | fig.savefig(f"results/figures/wadley_plots/wadley_{angle}_{l}.png") |
162 | if show: | 164 | if show: |
163 | fig.show() | 165 | fig.show() |
166 | + fig.close() | ||
164 | # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.") | 167 | # print(f"[{worker_nbr}]\tComputed joint distribution of angles (C{carbon}) and saved the figures.") |
165 | 168 | ||
166 | def stats_len(): | 169 | def stats_len(): |
... | @@ -440,7 +443,7 @@ def to_dist_matrix(f): | ... | @@ -440,7 +443,7 @@ def to_dist_matrix(f): |
440 | idty = dm.get_distance(al).matrix # list of lists | 443 | idty = dm.get_distance(al).matrix # list of lists |
441 | del al | 444 | del al |
442 | l = len(idty) | 445 | l = len(idty) |
443 | - np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ])) | 446 | + np.save("data/"+f+".npy", np.array([ idty[i] + [0]*(l-1-i) if i<l-1 else idty[i] for i in range(l) ], dtype=object)) |
444 | del idty | 447 | del idty |
445 | notify(f"Computed {f} distance matrix") | 448 | notify(f"Computed {f} distance matrix") |
446 | return 0 | 449 | return 0 | ... | ... |
-
Please register or login to post a comment