Louis BECQUEY

align_nt_stats moved to Chain class as method

Showing 1 changed file with 278 additions and 169 deletions
#!/usr/bin/python3.8
import numpy as np
import pandas as pd
import concurrent.futures, Bio.PDB.StructureBuilder, gzip, io, json, os, psutil, re, requests, sqlalchemy, subprocess, sys, time, warnings
import concurrent.futures, Bio.PDB.StructureBuilder, copy, gzip, io, json, os, psutil, re, requests, sqlalchemy, subprocess, sys, time, warnings
from Bio import AlignIO, SeqIO
from Bio.PDB import MMCIFParser
from Bio.PDB.mmcifio import MMCIFIO
@@ -105,17 +105,23 @@ class Chain:
self.full_mmCIFpath = "" # path to the source mmCIF structure
self.file = "" # path to the 3D PDB file
self.rfam_fam = "" # mapping to an RNA family
self.seq = "" # sequence
self.seq = "" # sequence with modified nts
self.aligned_seq = "" # sequence with modified nts replaced, but gaps can exist
self.length = -1 # length of the sequence (missing residues are not counted)
self.full_length = -1 # length of the chain extracted from source structure ([start; stop] interval)
self.delete_me = False # an error occurred during production/parsing
self.error_messages = "" # Error message(s) if any
self.frequencies = np.zeros((5,0)) # frequencies of nt at every position: A,C,G,U,Other
self.etas = [] # eta' pseudotorsion at every position, if available
self.thetas = [] # theta' pseudotorsion at every position, if available
self.mask = [] # nucleotide is available in 3D (1 or 0), for every position
self.data3D = None # Pandas DataFrame with all the 3D data extracted by DSSR.
def __str__(self):
return self.pdb_id + '[' + str(self.pdb_model) + "]-" + self.pdb_chain_id
def __eq__(self, other):
return self.chain_label == other.chain_label and str(self) == str(other)
def __hash__(self):
return hash((self.pdb_id, self.pdb_model, self.pdb_chain_id, self.chain_label))
def download_3D(self):
""" Look for the main CIF file (with all chains) from RCSB
@@ -140,6 +146,7 @@ class Chain:
except IOError:
print(status + f"\tERR \U0000274E\t\033[31mError downloading {url} !\033[0m")
self.delete_me = True
self.error_messages = f"Error downloading {url}"
def extract_portion(self, filename, pdb_start, pdb_end):
""" Extract the part which is mapped to Rfam from the main CIF file and save it to another file.
@@ -166,6 +173,7 @@ class Chain:
if self.full_mmCIFpath == "":
print(status + f"\t\U0000274E\t\033[31mError with CIF file of {self.pdb_id} !\033[0m", flush=True)
self.delete_me = True
self.error_messages = f"Error with CIF file of {self.pdb_id}"
return
# Load the whole mmCIF into a Biopython structure object:
@@ -199,12 +207,10 @@ class Chain:
"""
self.rfam_fam = rfam
print("\t> Associating it to", rfam, f"...\t\t\t{validsymb}")
def extract_3D_data(self):
""" Runs DSSR to annotate the 3D chain and get various information about it.
"""
""" Runs DSSR to annotate the 3D chain and get various information about it. """
# Check if the file exists. If not, compute it.
if not os.path.exists(path_to_3D_data+f"pseudotorsions/{self.chain_label}.csv"):
@@ -220,6 +226,7 @@ class Chain:
# DSSR is unable to parse the chain.
warn(f"Exception while running DSSR: {stderr}\n\tIgnoring {self.chain_label}.\t\t\t", error=True)
self.delete_me = True
self.error_messages = f"Exception while running DSSR for {self.chain_label}:\n {stderr}"
return
# Get the JSON from DSSR output
@@ -229,86 +236,255 @@ class Chain:
if "warning" in json_object.keys():
warn(f"Ignoring {self.chain_label} ({json_object['warning']})\t", error=True)
self.delete_me = True
self.error_messages = f"DSSR warning for {self.chain_label}: {json_object['warning']}"
return
# Extract the part about nucleotides from the DSSR annotation
# Extract the interesting parts
nts = json_object["nts"]
# Prepare a data structure (Pandas DataFrame)
resnum_start = int(nts[0]["nt_resnum"])
df = pd.DataFrame(nts)
# remove low-relevance or undocumented descriptors
df = df.drop(['summary', 'chain_name', 'index',
'v0', 'v1', 'v2', 'v3', 'v4', 'splay_angle',
'splay_distance', 'splay_ratio', 'sugar_class',
'amplitude', 'phase_angle'], axis=1)
df['P_x'] = [ float(i[0]) if i[0] is not None else np.NaN for i in df['P_xyz'] ] #
df['P_y'] = [ float(i[1]) if i[1] is not None else np.NaN for i in df['P_xyz'] ] #
df['P_z'] = [ float(i[2]) if i[2] is not None else np.NaN for i in df['P_xyz'] ] # Flatten the
df['C5prime_x'] = [ float(i[0]) if i[0] is not None else np.NaN for i in df['C5prime_xyz'] ] # Python dictionary
df['C5prime_y'] = [ float(i[1]) if i[1] is not None else np.NaN for i in df['C5prime_xyz'] ] #
df['C5prime_z'] = [ float(i[2]) if i[2] is not None else np.NaN for i in df['C5prime_xyz'] ] #
# Add a sequence column just for the alignments
df['nt_align_code'] = [ str(x).upper()
.replace('NAN', '-') # unresolved nucleotides become gaps
.replace('?', '-') # unidentified residues become gaps too
.replace('T', 'U') # 5MU is read as 't', which uppercases to 'T'
.replace('P', 'U') # pseudouridines; mapping them to 'U' is not strictly correct, see the DSSR paper, Fig. 2
for x in df['nt_code'] ]
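# --- Illustrative sketch (not part of this commit): the nt_code -> nt_align_code
# translation above, applied to a toy list containing a modified U ('t'), an
# unidentified residue ('?') and a missing one (NaN, which str() turns into 'nan').
toy_codes = ['G', 't', '?', float('nan'), 'P']
print([ str(x).upper().replace('NAN', '-').replace('?', '-').replace('T', 'U').replace('P', 'U')
        for x in toy_codes ])  # ['G', 'U', '-', '-', 'U']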
# Shift numbering when duplicate residue numbers are found.
# Example: 4v9q-DV contains 17 and 17A which are both read 17 by DSSR.
while True in df.duplicated(['nt_resnum']).values:
i = df.duplicated(['nt_resnum']).values.tolist().index(True)
df.iloc[i:, 1] += 1
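# --- Illustrative sketch (not part of this commit): how the duplicate-resnum shift
# above behaves on a toy version of the 4v9q-DV case (17 and 17A both read as 17).
# Column 1 is 'nt_resnum', as in the DataFrame built above.
import pandas as pd
toy = pd.DataFrame({"index_chain": [1, 2, 3, 4], "nt_resnum": [16, 17, 17, 18]})
while True in toy.duplicated(['nt_resnum']).values:
    i = toy.duplicated(['nt_resnum']).values.tolist().index(True)
    toy.iloc[i:, 1] += 1
print(toy.nt_resnum.tolist())  # [16, 17, 18, 19]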
l = df.iloc[-1,1] - df.iloc[0,1] + 1
# Add any missing rows caused by unresolved residues in the chain:
if l != len(df['index_chain']):
# We have some rows to add. First, identify them:
diff = set(range(l)).difference(df['nt_resnum'] - resnum_start)
for i in sorted(diff):
df = pd.concat([df.iloc[:i-1], pd.DataFrame({"index_chain": i, "nt_resnum": i+resnum_start-1,
"nt_code":'-', "nt_name":'-', 'nt_align_code':'-'}, index=[i-1]), df.iloc[i-1:]])
df.iloc[i:, 0] += 1
df = df.reset_index(drop=True)
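# --- Illustrative sketch (not part of this commit): the pd.concat splice used above
# to insert a placeholder row for an unresolved residue (toy indices, not the exact
# index arithmetic of this commit).
import pandas as pd
toy = pd.DataFrame({"index_chain": [1, 2, 3], "nt_resnum": [10, 11, 13], "nt_code": ['G', 'C', 'A']})
i = 3                       # the gap row should become the 3rd row
gap = pd.DataFrame({"index_chain": i, "nt_resnum": 12, "nt_code": '-'}, index=[i-1])
toy = pd.concat([toy.iloc[:i-1], gap, toy.iloc[i-1:]]).reset_index(drop=True)
toy.iloc[i:, 0] += 1        # shift index_chain of the rows that follow
print(toy[["index_chain", "nt_resnum"]].values.tolist())  # [[1, 10], [2, 11], [3, 12], [4, 13]]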
# Iterate over pairs to identify base-base interactions
res_ids = list(df['nt_id'])
paired = [ 0 ] * l
pair_type_LW = [ '' ] * l
pair_type_DSSR = [ '' ] * l
interacts = [ 0 ] * l
if "pairs" in json_object.keys():
pairs = json_object["pairs"]
for p in pairs:
nt1 = p["nt1"]
nt2 = p["nt2"]
if nt1 in res_ids and nt2 in res_ids:
nt1_idx = res_ids.index(nt1)
nt2_idx = res_ids.index(nt2)
paired[nt1_idx] = nt2_idx + 1
paired[nt2_idx] = nt1_idx + 1
interacts[nt1_idx] += 1
interacts[nt2_idx] += 1
pair_type_LW[nt1_idx] = p["LW"]
pair_type_LW[nt2_idx] = p["LW"]
pair_type_DSSR[nt1_idx] = p["DSSR"]
pair_type_DSSR[nt2_idx] = p["DSSR"]
elif nt1 in res_ids:
nt1_idx = res_ids.index(nt1)
interacts[nt1_idx] += 1
elif nt2 in res_ids:
nt2_idx = res_ids.index(nt2)
interacts[nt2_idx] += 1
df['paired'] = paired
df['pair_type_LW'] = pair_type_LW
df['pair_type_DSSR'] = pair_type_DSSR
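# --- Illustrative sketch (not part of this commit): how a DSSR-style "pairs" entry
# fills the 1-based 'paired' partner indices (toy nt identifiers).
toy_res_ids = ["A.G1", "A.C2", "A.G3", "A.U4"]
toy_paired = [0] * len(toy_res_ids)
toy_pairs = [{"nt1": "A.G1", "nt2": "A.U4", "LW": "cWW", "DSSR": "WC"}]
for p in toy_pairs:
    if p["nt1"] in toy_res_ids and p["nt2"] in toy_res_ids:
        a, b = toy_res_ids.index(p["nt1"]), toy_res_ids.index(p["nt2"])
        toy_paired[a], toy_paired[b] = b + 1, a + 1
print(toy_paired)  # [4, 0, 0, 1]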
# Iterate over multiplets to identify base-base interactions
if "multiplets" in json_object.keys():
multiplets = json_object["multiplets"]
for m in multiplets:
nts = m["nts_long"].split(',')
# iterate over the nts of a multiplet
for j, nt in enumerate(nts):
# if the nt is in that chain:
if nt in res_ids:
i = res_ids.index(nt)
# iterate over those other nts
for o in nts[:j]+nts[j+1:]:
if o in res_ids and str(res_ids.index(o)+1) not in str(df['paired'][i]): # and it's not already in 'paired'
df.loc[i,'paired'] = str(df['paired'][i]) + ',' + str(res_ids.index(o)+1)
interacts[i] = len(str(df['paired'][i]).split(','))
df['Ninteract'] = interacts
df = df.drop(['C5prime_xyz', 'P_xyz', 'nt_id'], axis=1) # remove now useless descriptors
if self.reversed:
# Either the 3D structure is numbered from 3' to 5' instead of the standard 5' to 3',
# or the sequence that matches the Rfam family runs 3' to 5'.
# In both cases, the angles must be inverted.
warn(f"Has {self.chain_label} been numbered from 3' to 5'? Inverting pseudotorsions; other angle measures are not corrected.")
df = df.reindex(index=df.index[::-1]).reset_index(drop=True)
df['index_chain'] = 1 + df.index
temp_eta = df['eta']
df['eta'] = [ df['theta'][n] for n in range(l) ] # eta(n) = theta(l-n+1) forall n in ]1, l]
df['theta'] = [ temp_eta[n] for n in range(l) ] # theta(n) = eta(l-n+1) forall n in [1, l[
temp_eta = df['eta_prime']
df['eta_prime'] = [ df['theta_prime'][n] for n in range(l) ] # eta(n) = theta(l-n+1) forall n in ]1, l]
df['theta_prime'] = [ temp_eta[n] for n in range(l) ] # theta(n) = eta(l-n+1) forall n in [1, l[
temp_eta = df['eta_base']
df['eta_base'] = [ df['theta_base'][n] for n in range(l) ] # eta(n) = theta(l-n+1) forall n in ]1, l]
df['theta_base'] = [ temp_eta[n] for n in range(l) ] # theta(n) = eta(l-n+1) forall n in [1, l[
newpairs = []
for v in df['paired']:
v = str(v) # entries can be ints (from the 'pairs' block) or comma-separated strings (from multiplets)
if ',' in v:
temp_v = []
vs = v.split(',')
for _ in vs:
temp_v.append(str(l-int(_)+1))
newpairs.append(','.join(temp_v))
elif int(v):
newpairs.append(str(l-int(v)+1))
else:
newpairs.append(v) # unpaired nucleotide, keep the 0
df['paired'] = newpairs
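# --- Illustrative sketch (not part of this commit): the 1-based pair-partner remapping
# above, for a toy chain of length 5 read 3'->5' (multiplet entries are comma-separated).
toy_l = 5
toy_old = ['2', '1', '0', '5,1', '4']
toy_new = []
for v in toy_old:
    if ',' in v:
        toy_new.append(','.join(str(toy_l - int(x) + 1) for x in v.split(',')))
    elif int(v):
        toy_new.append(str(toy_l - int(v) + 1))
    else:
        toy_new.append(v)   # unpaired stays 0
print(toy_new)  # ['4', '5', '0', '1,5', '2']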
except KeyError as e:
# Most likely, there is no 'nts' section in the DSSR output. Abort.
warn(f"Error while parsing DSSR's json output:\n{json_object.keys()}\n{json_object}\n\tignoring {self.chain_label}\t\t\t\t", error=True)
warn(f"Error while parsing DSSR's json output:\n{e}\n\tignoring {self.chain_label}\t\t\t\t", error=True)
self.delete_me = True
self.error_messages = f"Error while parsing DSSR's json output:\n{e}"
return
print("\t> Computing", self.chain_label, f"pseudotorsions...\t\t{validsymb}", flush=True)
# Prepare data structures to extract the pseudotorsions
l = int(nts[-1]["nt_resnum"]) - int(nts[0]["nt_resnum"]) + 1
eta = [np.NaN] * l
theta = [np.NaN] * l
mask = [ 0 ] * l # nts that are not resolved will have a mask at 0
seq = [ "-" ] * l # ... and will be marked "-" in the sequence.
# the residue number of the chain start (which has been previously extracted from a whole mmCIF file)
resnum_start = int(nts[0]["nt_resnum"])
for nt in nts:
if nt["eta_prime"] is None:
e = np.NaN
else:
e = float(nt["eta_prime"])
if nt["theta_prime"] is None:
t = np.NaN
else:
t = float(nt["theta_prime"])
p = int(nt["nt_resnum"]) - resnum_start # index of the nt in the chain
# All non-regular-ACGU nucleotides (incl. modified & missing) are masked to 0.
# They will be replaced by the consensus in the alignment.
# U is a U, u is a modified U and should be replaced.
mask[p] = int(nt["nt_code"] in "ACGU")
# seq will be later updated with consensus for all positions with mask=0
seq[p] = nt["nt_code"].upper().replace('?','-').replace('P','U').replace('T','U')
# pseudotorsions
eta[p] = e
theta[p] = t
if self.reversed:
# The 3D structure is numbered from 3' to 5' instead of standard 5' to 3'
# or the sequence that matches the Rfam family is 3' to 5' instead of standard 5' to 3'.
# Anyways, you need to invert the angles.
warn(f"Has {self.chain_label} been numbered from 3' to 5' ? Inverting angles.")
seq = seq[::-1]
mask = mask[::-1]
temp_eta = [ e for e in eta ]
eta = [ theta[n] for n in range(l) ] # eta(n) = theta(l-n+1) forall n in ]1, l]
theta = [ temp_eta[n] for n in range(l) ] # theta(n) = eta(l-n+1) forall n in [1, l[
# Creating a df for easy saving to CSV
pd.DataFrame({"seq": list(seq), "m": list(mask), "eta": list(eta), "theta": list(theta)}
).to_csv(path_to_3D_data+f"pseudotorsions/{self.chain_label}.csv")
df.to_csv(path_to_3D_data + f"pseudotorsions/{self.chain_label}.csv")
del df
print("\t> Saved", self.chain_label, f"pseudotorsions to CSV.\t\t{validsymb}", flush=True)
else:
print("\t> Computing", self.chain_label, f"pseudotorsions...\t{validsymb}\t(already done)", flush=True)
# Now load data from the CSV file
d = pd.read_csv(path_to_3D_data+f"pseudotorsions/{self.chain_label}.csv")
self.seq = "".join(d.seq.values)
self.length = len([ x for x in d.seq if x != "-" ])
self.full_length = len(d.seq)
self.mask = "".join([ str(int(x)) for x in d.m.values])
self.etas = d.eta.values
self.thetas = d.theta.values
d = pd.read_csv(path_to_3D_data+f"pseudotorsions/{self.chain_label}.csv", index_col=0)
self.seq = "".join(d.nt_code.values)
self.aligned_seq = "".join(d.nt_align_code.values)
self.length = len([ x for x in self.aligned_seq if x != "-" ])
self.full_length = len(d.nt_code)
self.data3D = d
print(f"\t> Loaded data from CSV\t\t\t\t{validsymb}", flush=True)
# Remove too short chains
if self.length < 5:
warn(f"{self.chain_label} sequence is too short, let's ignore it.\t", error=True)
self.delete_me = True
self.error_messages = "Sequence is too short. (< 5 resolved nts)"
return
def set_freqs_from_aln(self, s_seq, freqs):
"""Maps the object's sequence to its version in a MSA, to compute nucleotide frequencies at every position.
s_seq: the aligned version of self.aligned_seq
freqs: the nucleotide frequencies at every position of s_seq
This also replaces gaps by the most common nucleotide.
"""
alilen = len(s_seq)
# Save columns at the appropriate positions
i = 0
j = 0
while i<self.full_length and j<alilen:
# Here we try to map self.aligned_seq (the sequence of the 3D chain, including gaps when residues are missing),
# with s_seq, the sequence aligned in the MSA, containing any of ACGU and two types of gaps, - and .
if self.aligned_seq[i] == s_seq[j].upper(): # alignment and sequence correspond (incl. gaps)
self.frequencies = np.concatenate((self.frequencies, freqs[:,j].reshape(-1,1)), axis=1)
i += 1
j += 1
elif self.aligned_seq[i] == '-': # gap in the chain, but not in the aligned sequence
# search for a nearby '-' gap column in the aligned sequence
k = 0
while j+k<alilen and s_seq[j+k] in ['.','-']:
if s_seq[j+k] == '-':
break
k += 1
# if found, set j to that position
if j+k<alilen and s_seq[j+k] == '-':
j = j + k
continue
# if not, search for an insertion gap nearby
if j<alilen and s_seq[j] == '.':
self.frequencies = np.concatenate((self.frequencies, freqs[:,j].reshape(-1,1)), axis=1)
i += 1
j += 1
continue
# else, just ignore the gap.
self.frequencies = np.concatenate((self.frequencies, np.array([0.0,0.0,0.0,0.0,1.0]).reshape(-1,1)), axis=1)
i += 1
elif s_seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
j += 1 # ignore the column
else: # sequence mismatch which is not a gap...
print(f"You are never supposed to reach this. Comparing {self.chain_label} in {i} ({self.aligned_seq[i-1:i+2]}) with seq[{j}] ({s_seq[j-3:j+4]}).\n",
self.aligned_seq,
sep='', flush=True)
exit(1)
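# --- Illustrative sketch (not part of this commit): a standalone version of the
# two-pointer walk above. It returns, for each position of the 3D chain sequence
# (where '-' marks unresolved residues), the alignment column it maps to, or None
# when no column can be found; the real method stores frequency columns instead.
def map_chain_to_alignment(chain_seq, ali_seq):
    cols, i, j = [], 0, 0
    while i < len(chain_seq) and j < len(ali_seq):
        if chain_seq[i] == ali_seq[j].upper():      # residues (or gaps) agree
            cols.append(j); i += 1; j += 1
        elif chain_seq[i] == '-':                   # unresolved residue in the 3D chain
            k = 0
            while j + k < len(ali_seq) and ali_seq[j+k] in ('.', '-'):
                if ali_seq[j+k] == '-':
                    break
                k += 1
            if j + k < len(ali_seq) and ali_seq[j+k] == '-':
                j = j + k                           # jump to the nearby '-' column
            elif ali_seq[j] == '.':
                cols.append(j); i += 1; j += 1      # use the insertion column
            else:
                cols.append(None); i += 1           # no suitable column, give up on this residue
        elif ali_seq[j] in ('.', '-'):              # gap column with no 3D residue
            j += 1
        else:
            raise ValueError("unexpected mismatch")
    return cols

print(map_chain_to_alignment("A-CG", "A.-CG"))      # [0, 2, 3, 4]
print(map_chain_to_alignment("AC-GU", "AC-G.U"))    # [0, 1, 2, 3, 5]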
# Replace gapped positions by the consensus sequence:
c_aligned_seq = list(self.aligned_seq)
c_seq = list(self.seq)
letters = ['A', 'C', 'G', 'U', 'N']
for i in range(self.full_length):
if c_aligned_seq[i] == '-': # (then c_seq[i] also is)
freq = self.frequencies[:,i]
l = letters[freq.tolist().index(max(freq))]
c_aligned_seq[i] = l
c_seq[i] = l
self.data3D.iloc[i,3] = l # self.data3D['nt_code'][i]
self.aligned_seq = ''.join(c_aligned_seq)
self.seq = ''.join(c_seq)
# Temporary np array to store the computations
point = np.zeros((11, self.full_length))
for i in range(self.full_length):
# normalized position in the chain
point[0,i] = float(i+1)/self.full_length
# one-hot encoding of the actual sequence
if self.seq[i] in letters[:4]:
point[ 1 + letters[:4].index(self.seq[i]), i ] = 1
else:
point[5,i] = 1
# PSSMs
point[6,i] = self.frequencies[0, i]
point[7,i] = self.frequencies[1, i]
point[8,i] = self.frequencies[2, i]
point[9,i] = self.frequencies[3, i]
point[10,i] = self.frequencies[4, i]
self.data3D = pd.concat([self.data3D, pd.DataFrame(point.T, columns=["position","is_A","is_C","is_G","is_U","is_other","freq_A","freq_C","freq_G","freq_U","freq_other"])], axis=1)
# save to file
self.data3D.to_csv(path_to_3D_data + "datapoints/" + self.chain_label)
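# --- Illustrative sketch (not part of this commit): assembling the 11-row
# position / one-hot / PSSM block for a toy 3-nt sequence with made-up frequencies,
# mirroring the 'point' array built above.
import numpy as np, pandas as pd
toy_seq = "GA-"
toy_freqs = np.array([[0.1, 0.8, 0.2],   # A
                      [0.1, 0.1, 0.2],   # C
                      [0.7, 0.0, 0.2],   # G
                      [0.1, 0.1, 0.2],   # U
                      [0.0, 0.0, 0.2]])  # other
letters = ['A', 'C', 'G', 'U', 'N']
L = len(toy_seq)
point = np.zeros((11, L))
for i in range(L):
    point[0, i] = float(i + 1) / L                        # normalized position
    if toy_seq[i] in letters[:4]:
        point[1 + letters[:4].index(toy_seq[i]), i] = 1   # one-hot A/C/G/U
    else:
        point[5, i] = 1                                   # anything else
    point[6:11, i] = toy_freqs[:, i]                      # PSSM columns
toy_df = pd.DataFrame(point.T, columns=["position","is_A","is_C","is_G","is_U","is_other",
                                        "freq_A","freq_C","freq_G","freq_U","freq_other"])
print(toy_df.shape)  # (3, 11)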
class Job:
""" This class contains information about a task to run later.
@@ -885,7 +1061,6 @@ def build_chain(c, rfam, pdb_start, pdb_end):
# If no problems, map it to an Rfam family, and annotate it with DSSR
if not c.delete_me:
c.set_rfam(rfam)
c.extract_3D_data()
# If there were newly discovered problems, add this chain to the known issues
@@ -894,6 +1069,9 @@ def build_chain(c, rfam, pdb_start, pdb_end):
f = open(path_to_3D_data + "known_issues.txt", 'a')
f.write(c.chain_label + '\n')
f.close()
f = open(path_to_3D_data + "known_issues_reasons.txt", 'a')
f.write(c.chain_label + '\n' + c.error_messages + '\n\n')
f.close()
# The Chain object is ready
return c
@@ -927,8 +1105,7 @@ def cm_realign(rfam_acc, chains, label):
# Add the chains sequences to the file
for c in chains:
seq_str = c.seq.replace('U','T').replace('-','').replace('P','U') # We align as DNA, and we replace pseudo-uridines by uridines.
f.write(f"> {str(c)}\n"+seq_str+'\n')
f.write(f"> {str(c)}\n"+c.aligned_seq.replace('-', '').replace('U','T')+'\n')
f.close()
@@ -1008,20 +1185,18 @@ def summarize_position(col):
"""
# Count the different chars in the column
counts = { 'A': col.count('A') + col.count('a'), # caps are nt in conformity with the consensus (?)
'C':col.count('C') + col.count('c'),
'G':col.count('G') + col.count('g'),
'U':col.count('U') + col.count('u'),
'-':col.count('-'), '.':col.count('.')}
counts = { 'A':col.count('A'), 'C':col.count('C'),
'G':col.count('G'), 'U':col.count('U'),
'-':col.count('-'), '.':col.count('.') }
# Compute how many are regular ACGU and how many are not
# Count modified nucleotides
known_chars_count = 0
chars = ''.join(set(col))
chars = set(col)
for char in chars:
if char in "ACGU":
known_chars_count += counts[char]
elif char not in "-.acgu":
counts[char] = col.count(char)
# elif char not in "-.":
# counts[char] = col.count(char)
N = len(col) - counts['-'] - counts['.'] # number of ungapped residues
if N: # prevent division by zero if the column is only gaps
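# --- Illustrative sketch (not part of this commit): counting a toy alignment column
# the way summarize_position does above. The tail of the function is not shown in this
# hunk; here we simply assume the counts get normalized by N in the end.
toy_col = list("AAGU-CA.A")
toy_counts = { 'A': toy_col.count('A'), 'C': toy_col.count('C'),
               'G': toy_col.count('G'), 'U': toy_col.count('U'),
               '-': toy_col.count('-'), '.': toy_col.count('.') }
toy_known = sum(toy_counts[c] for c in "ACGU" if c in set(toy_col))
toy_N = len(toy_col) - toy_counts['-'] - toy_counts['.']   # ungapped residues
if toy_N:
    toy_freqs = [ toy_counts[c]/toy_N for c in "ACGU" ] + [ (toy_N - toy_known)/toy_N ]
print(toy_N, [round(x, 3) for x in toy_freqs])  # 7 [0.571, 0.143, 0.143, 0.143, 0.0]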
@@ -1033,6 +1208,7 @@ def alignment_nt_stats(f):
""" Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
Also saves every chain of the family to file.
Uses only 1 core, so this function can be called in parallel.
"""
# Get a worker number to position the progress bar
@@ -1048,7 +1224,7 @@ def alignment_nt_stats(f):
align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
alilen = align.get_alignment_length()
except:
warn(f,"alignment is wrong. Recompute it and retry.", error=True)
warn(f"{f}'s alignment is wrong. Recompute it and retry.", error=True)
exit(1)
# Compute statistics per column
@@ -1058,93 +1234,18 @@ def alignment_nt_stats(f):
frequencies = np.array(results).T
# For each sequence, find the right chain and save the PSSMs inside.
pbar = tqdm(total=len(chains_ids), position=thr_idx+1, desc=f"Worker {thr_idx+1}: {f} chains", leave=False)
pbar.update(0)
for s in align:
if not '[' in s.id: # this is a Rfamseq entry, not a 3D chain
continue
# get the right 3D chain:
idx = chains_ids.index(s.id)
c = list_of_chains[idx]
# Save columns at the appropriate positions
i = 0
j = 0
#warn_gaps = False
while i<c.full_length and j<alilen:
# Here we try to map c.seq (the sequence of the 3D chain, including gaps when residues are missing),
# with s.seq, the sequence aligned in the MSA, containing any of ACGUacgu and two types of gaps, - and .
if c.seq[i] == s.seq[j].upper(): # alignment and sequence correspond (incl. gaps)
list_of_chains[idx].frequencies = np.concatenate((list_of_chains[idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1)
i += 1
j += 1
elif c.seq[i] == '-': # gap in the chain, but not in the aligned sequence
# search for a gap to the consensus nearby
k = 0
while j+k<alilen and s.seq[j+k] in ['.','-']:
if s.seq[j+k] == '-':
break
k += 1
# if found, set j to that position
if j+k<alilen and s.seq[j+k] == '-':
j = j + k
continue
# if not, search for an insertion gap nearby
if j<alilen and s.seq[j] == '.':
list_of_chains[idx].frequencies = np.concatenate((list_of_chains[idx].frequencies, frequencies[:,j].reshape(-1,1)), axis=1)
i += 1
j += 1
continue
# else, just ignore the gap.
#warn_gaps = True
list_of_chains[idx].frequencies = np.concatenate((list_of_chains[idx].frequencies, np.array([0.0,0.0,0.0,0.0,1.0]).reshape(-1,1)), axis=1)
i += 1
elif s.seq[j] in ['.', '-']: # gap in the alignment, but not in the real chain
j += 1 # ignore the column
else:
print(f"You are never supposed to reach this. Comparing {c.chain_label} in {i} ({c.seq[i-1:i+2]}) with seq[{j}] ({s.seq[j-3:j+4]}).\n", c.seq, '\n', s.seq, sep='', flush=True)
exit(1)
#if warn_gaps:
#warn(f"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them.")
# Replace masked positions by the consensus sequence:
c_seq = c.seq.split()
letters = ['A', 'C', 'G', 'U', 'N']
for i in range(c.full_length):
if not c.mask[i]:
freq = list_of_chains[idx].frequencies[:,i]
c_seq[i] = letters[freq.tolist().index(max(freq))]
list_of_chains[idx].seq = ''.join(c_seq)
# Saving 'final' datapoint
c = list_of_chains[idx] # update the local object
point = np.zeros((13, c.full_length))
for i in range(c.full_length):
point[0,i] = i+1 # position
# one-hot encoding of the actual sequence
if c.seq[i] in letters[:4]:
point[ 1 + letters[:4].index(c.seq[i]), i ] = 1
else:
point[5,i] = 1
list_of_chains[idx].set_freqs_from_aln(s.seq, frequencies)
pbar.update(1)
pbar.close()
# save the PSSMs
point[6,i] = c.frequencies[0, i]
point[7,i] = c.frequencies[1, i]
point[8,i] = c.frequencies[2, i]
point[9,i] = c.frequencies[3, i]
point[10,i] = c.frequencies[4, i]
point[11,i] = c.etas[i]
point[12,i] = c.thetas[i]
file = open(path_to_3D_data + "datapoints/" + c.chain_label, "w") # no check, always overwrite, this is desired
for i in range(13):
line = [str(x) for x in list(point[i,:]) ]
file.write(','.join(line)+'\n')
file.close()
idxQueue.put(thr_idx) # replace the thread index in the queue
return 0
@@ -1161,7 +1262,7 @@ if __name__ == "__main__":
# ===========================================================================
# List all 3D RNA chains below 4Ang resolution
all_chains = download_BGSU_NR_list()
all_chains = set(download_BGSU_NR_list())
# Ask Rfam if some are mapped to Rfam families
mappings = download_Rfam_PDB_mappings()
@@ -1170,8 +1271,13 @@ if __name__ == "__main__":
chains_with_mapping = []
for c in all_chains:
mapping = mappings.loc[ (mappings.pdb_id == c.pdb_id) & (mappings.chain == c.pdb_chain_id) ]
if len(mapping.rfam_acc.values):
chains_with_mapping.append(c)
n = len(mapping.rfam_acc.values)
for j in range(n):
if j == n-1:
chains_with_mapping.append(c)
else:
chains_with_mapping.append(copy.deepcopy(c))
chains_with_mapping[-1].set_rfam(mapping.rfam_acc.values[j])
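# --- Illustrative sketch (not part of this commit): why copy.deepcopy is needed above
# when one PDB chain maps to several Rfam families (toy stand-in class).
import copy
class ToyChain:
    def __init__(self):
        self.rfam_fam = ""
fams = ["RF00001", "RF00005"]
# naive version: appending the same object twice, the second family overwrites the first
c1 = ToyChain(); naive = []
for fam in fams:
    naive.append(c1)
    naive[-1].rfam_fam = fam
print([x.rfam_fam for x in naive])  # ['RF00005', 'RF00005']
# deepcopy for all but the last mapping, as above: each entry keeps its own family
c2 = ToyChain(); dup = []
for j, fam in enumerate(fams):
    dup.append(c2 if j == len(fams) - 1 else copy.deepcopy(c2))
    dup[-1].rfam_fam = fam
print([x.rfam_fam for x in dup])    # ['RF00001', 'RF00005']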
n_chains = len(chains_with_mapping)
# ===========================================================================
@@ -1196,14 +1302,16 @@ if __name__ == "__main__":
for c in chains_with_mapping:
# read mappings information
mapping = mappings.loc[ (mappings.pdb_id == c.pdb_id) & (mappings.chain == c.pdb_chain_id) ]
mapping = mappings.loc[ (mappings.pdb_id == c.pdb_id) & (mappings.chain == c.pdb_chain_id) & (mappings.rfam_acc == c.rfam_fam) ]
pdb_start = str(mapping.pdb_start.values[0])
pdb_end = str(mapping.pdb_end.values[0])
# Add a job to build the chain to the list
c.chain_label = f"{c.pdb_id}_{str(c.pdb_model)}_{c.pdb_chain_id}_{pdb_start}-{pdb_end}"
ncores = read_cpu_number()
if c.chain_label not in known_issues:
joblist.append(Job(function=build_chain, # Apply function build_chain to every c.chain_label
how_many_in_parallel=ncores,
args=[c, mapping.rfam_acc.values[0], pdb_start, pdb_end]))
# Prepare the results folders
@@ -1211,7 +1319,7 @@ if __name__ == "__main__":
os.makedirs(path_to_3D_data + "RNAcifs") # for the whole structures
if not path.isdir(path_to_3D_data + "rna_mapped_to_Rfam"):
os.makedirs(path_to_3D_data + "rna_mapped_to_Rfam") # for the portions mapped to Rfam
if not os.isdir(path_to_3D_data+"pseudotorsions/"):
if not path.isdir(path_to_3D_data+"pseudotorsions/"):
os.makedirs(path_to_3D_data+"pseudotorsions/") # for the annotations by DSSR
# Run the builds and extractions
@@ -1229,7 +1337,7 @@ if __name__ == "__main__":
# Preparing a results folder
if not os.access(path_to_seq_data + "realigned/", os.F_OK):
os.makedirs(path_to_seq_data + "realigned/")
# Get the list of Rfam families found
rfam_acc_to_download = {}
for c in loaded_chains:
@@ -1287,11 +1395,12 @@ if __name__ == "__main__":
# Prepare the architecture of a shiny multi-progress-bars design
thr_idx_mgr = Manager() # Push the number of workers to a queue.
idxQueue = thr_idx_mgr.Queue() # ... Then each Pool worker will
for i in range(read_cpu_number()): # ... pick a number from the queue when starting the computation for one family,
for i in range(ncores): # ... pick a number from the queue when starting the computation for one family,
idxQueue.put(i) # ... and replace it when the computation has ended so it could be picked up later.
# Start a process pool to dispatch the RNA families over multiple CPUs
p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=read_cpu_number())
# Start a process pool to dispatch the RNA families,
# over multiple CPUs (one family per CPU)
p = Pool(initializer=tqdm.set_lock, initargs=(tqdm.get_lock(),), processes=ncores)
fam_pbar = tqdm(total=len(fam_list), desc="RNA families", position=0, leave=True)
for i, _ in enumerate(p.imap_unordered(alignment_nt_stats, fam_list)): # Apply alignment_nt_stats to each RNA family
@@ -1304,4 +1413,4 @@ if __name__ == "__main__":
print("Completed.") # This part of the code is supposed to release some serotonin in the modeller's brain
# # so i can sleep for the end of the night
# subprocess.run(["shutdown","now"])
\ No newline at end of file
# subprocess.run(["shutdown","now"])