Louis BECQUEY

Merged Khodor-internship with ahead master

@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
from Bio.Seq import MutableSeq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from pydca.plmdca import plmdca
runDir = os.getcwd()
def trace_unhandled_exceptions(func):
@@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2])
validsymb = '\U00002705'
warnsymb = '\U000026A0'
errsymb = '\U0000274C'
LSU_set = {"RF00002", "RF02540", "RF02541",
"RF02543", "RF02546"} # From Rfam CLAN 00112
SSU_set = {"RF00177", "RF02542", "RF02545",
"RF01959", "RF01960"} # From Rfam CLAN 00111
no_nts_set = set()
weird_mappings = set()
@@ -845,14 +845,14 @@ class Downloader:
if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
print("\t> Use of the previous version.\t", end="", flush=True)
else:
return pd.DataFrame([], columns=["class", "class_members"])
return pd.DataFrame([], columns=["class","representative","class_members"])
nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ]
full_structures_list = [ tuple(i[1]) for i in nrlist[["class","representative","class_members"]].iterrows() ]
print(f"\t{validsymb}", flush=True)
# The beginning of an adventure.
return full_structures_list # list of ( str (class), str (class_members) )
return full_structures_list # list of ( str (class), str (representative), str (class_members) )
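# Shape illustration (hypothetical values): each tuple holds the equivalence class
# name, its representative chain, and the '+'-separated class members, e.g.
# [("NR_4.0_00001.1", "1FFK|1|0", "1FFK|1|0+4V9D|1|AA+4V9D|1|BB"), ...]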
def download_from_SILVA(self, unit):
@@ -966,6 +966,9 @@ class Pipeline:
self.RUN_STATS = False
self.EXTRACT_CHAINS = False
self.REUSE_ALL = False
self.REDUNDANT = False
self.ALIGNOPTS = None
self.USESINA = False
self.SELECT_ONLY = None
self.ARCHIVE = False
self.SAVELOGS = True
@@ -981,8 +984,9 @@
setproctitle("RNANet.py process_options()")
try:
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=",
"from-scratch", "full-inference", "no-homology", "ignore-issues", "extract",
opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=",
"only=", "cmalign-opts=", "maxcores=", "sina", "from-scratch",
"full-inference", "no-homology", "redundant", "ignore-issues", "extract",
"all", "no-logs", "archive", "update-homologous", "version"])
except getopt.GetoptError as err:
print(err)
@@ -1006,6 +1010,7 @@
print("--------------------------------------------------------------------------------------------------------------")
print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of"
"\n\t\t\t\t chains mapped to different families.")
print("--redundant\t\t\t\tStore the class members in the database thoughts to be redundant for predictions.")
print("-s\t\t\t\tRun statistics computations after completion")
print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
@@ -1022,9 +1027,12 @@
print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
"\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
"\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
print("--sina\t\t\tForce the RNANet to align large subunit LSU and small subunit SSU ribosomal RNA using sina instead of infernal,"
"\n\t\t\t\t\t the other RNA families will be aligned using infernal.")
print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous"
"\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions"
"\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.")
print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"")
print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments,"
"\n\t\t\t\t and update the link to the latest archive. ")
print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
@@ -1042,8 +1050,8 @@
print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &")
sys.exit()
elif opt == '--version':
print("RNANet v1.3 beta, parallelized, Dockerized")
print("Last revision : Jan 2021")
print("RNANet v1.4 beta, parallelized, Dockerized")
print("Last revision : April 2021")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0
@@ -1089,6 +1097,8 @@
path_to_seq_data + "realigned",
path_to_seq_data + "rfam_sequences"])
self.REUSE_ALL = True
elif opt == "cmalign-opts":
self.ALIGNOPTS = arg
elif opt == "--all":
self.REUSE_ALL = True
self.USE_KNOWN_ISSUES = False
@@ -1103,6 +1113,10 @@
ncores = min(ncores, int(arg))
elif opt == "-f" or opt == "--full-inference":
self.FULLINFERENCE = True
elif opt=="--redundant":
self.REDUNDANT=True
elif opt=="--sina":
self.USESINA=True
if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
@@ -1151,7 +1165,8 @@
work_infer_mappings,
not self.REUSE_ALL,
allmappings,
self.FULLINFERENCE
self.FULLINFERENCE,
self.REDUNDANT
),
full_structures_list,
chunksize=1)):
@@ -1340,7 +1355,7 @@
joblist = []
for f in self.fam_list:
joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[
self.dl, f, rfam_acc_to_download[f]]))
self.dl, self.USESINA, f, rfam_acc_to_download[f]]))
try:
execute_joblist(joblist)
@@ -1365,7 +1380,7 @@
joblist = []
for f in self.fam_list:
# the function already uses all CPUs so launch them one by one (how_many_in_parallel=1)
joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f))
joblist.append(Job(function=work_realign, args=[self.USESINA, self.ALIGNOPTS, f], how_many_in_parallel=1, label=f))
# Execute the jobs
try:
@@ -1753,6 +1768,10 @@ def sql_define_tables(conn):
freq_G REAL,
freq_U REAL,
freq_other REAL,
fields_A REAL,
fields_C REAL,
fields_G REAL,
fields_U REAL,
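-- The fields_* columns are assumed to hold the single-site fields of the Potts
-- model inferred by pydca for each nucleotide (see work_pydca below).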
gap_percent REAL,
consensus CHAR(1),
cons_sec_struct CHAR(1),
@@ -1937,7 +1956,7 @@ def execute_joblist(fulljoblist):
return results
@trace_unhandled_exceptions
def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list:
def work_infer_mappings(update_only, allmappings, fullinference, redundant, codelist) -> list:
"""Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
build a list of Chain() objects mapped to Rfam families, by expanding available mappings
of any element of the list to all the list elements.
@@ -1950,8 +1969,8 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
# Split the comma-separated list of chain codes into chain codes:
eq_class = codelist[0]
codes = codelist[1].replace('+', ',').split(',')
codes = codelist[2].replace('+', ',').split(',')
representative = codelist[1].replace('+', ',').split(',')[0]
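# Illustration (hypothetical codes): if "1FFK|1|0" has an Rfam mapping to RF02540 and
# shares its equivalence class with "4V9D|1|AA", the mapping is expanded to "4V9D|1|AA"
# as well; codelist[1] names the representative chain of the class.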
# Search for mappings that apply to an element of this PDB chains list:
for c in codes:
# search for Rfam mappings with this chain c:
@@ -2040,6 +2059,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
# Now build Chain() objects for the mapped chains
for c in codes:
if not redundant and c != representative:
# By default, save only the representative member of the class.
# If --redundant is passed, save all chains of the class members.
continue
nr = c.split('|')
pdb_id = nr[0].lower()
pdb_model = int(nr[1])
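# e.g. a hypothetical code "4V9D|1|AA" gives pdb_id "4v9d" and pdb_model 1;
# the chain label in nr[2] is parsed just after this hunk.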
@@ -2202,13 +2228,12 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
return c
@trace_unhandled_exceptions
def work_prepare_sequences(dl, rfam_acc, chains):
def work_prepare_sequences(dl, useSina, rfam_acc, chains):
"""Prepares FASTA files of homologous sequences to realign with cmalign or SINA.
"""
setproctitle("RNAnet.py work_prepare_sequences()")
if rfam_acc in LSU_set | SSU_set: # rRNA
if useSina and rfam_acc in LSU_set | SSU_set:
if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
# Detect duplicates and remove them
existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
@@ -2290,109 +2315,123 @@ def work_prepare_sequences(dl, rfam_acc, chains):
notify(status)
@trace_unhandled_exceptions
def work_realign(rfam_acc):
""" Runs multiple sequence alignements by RNA family.
It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
Rfam covariance models are used with Infernal tools, except for rRNAs.
cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs.
def use_sina(rfam_acc):
"""
setproctitle(f"RNAnet.py work_realign({rfam_acc})")
if rfam_acc in LSU_set | SSU_set:
# Ribosomal subunits deserve a special treatment.
# They require too much RAM to be aligned with Infernal.
# Then we will use SINA instead.
if rfam_acc in ["RF00177", "RF01960"]:
arbfile = "realigned/SSU.arb"
else:
arbfile = "realigned/LSU.arb"
# Run alignment
p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
"-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"-r", path_to_seq_data + arbfile,
"--meta-fmt=csv"])
When the user requests SINA, it is used to align the rRNA SSU and LSU families.
"""
if rfam_acc in ["RF00177", "RF01960"]:
arbfile = "realigned/SSU.arb"
else:
# Align using Infernal for most RNA families
if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
# Alignment exists. We just want to add new sequences into it.
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
# there are no new sequences to align...
return
arbfile = "realigned/LSU.arb"
existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
# Run alignment
subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
"-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"-r", path_to_seq_data + arbfile,
"--meta-fmt=csv"])
# Align the new sequences
with open(new_ali_path, 'w') as o:
p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
"--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
stdout=o, stderr=subprocess.PIPE)
notify("Aligned new sequences together")
# Detect duplicates and remove them
existing_stk = AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [r.id for r in existing_stk]
del existing_stk
new_stk = AlignIO.read(new_ali_path, "stockholm")
new_ids = [r.id for r in new_stk]
del new_stk
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
toremove.write('\n'.join(doublons)+'\n')
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.remove(path_to_seq_data + "realigned/toremove.txt")
# And we merge the two alignments
p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
notify("Merged alignments into one")
@trace_unhandled_exceptions
def use_infernal(rfam_acc, alignopts):
"""
Infernal is the default alignment tool, unless the user requests otherwise.
cmalign is used for all families, except when the user prefers to align rRNAs with SINA.
"""
if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
# Alignment exists. We just want to add new sequences into it.
# remove the partial files
os.remove(new_ali_path)
os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
# there are no new sequences to align...
return
else:
# Alignment does not exist yet. We need to compute it from scratch.
print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
p = subprocess.run(["cmalign", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
"--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
# Align the new sequences
with open(new_ali_path, 'w') as o:
p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
"--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
"-o", path_to_seq_data + f"realigned/{rfam_acc}++.stk",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa"],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p.stderr.decode("utf-8")
if len(stderr):
print('', flush=True)
warn(f"Error during sequence alignment: {stderr}", error=True)
with open(runDir + "/errors.txt", "a") as er:
er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
return 1
else:
print('\t'+validsymb, flush=True)
path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
stdout=o, stderr=subprocess.PIPE)
notify("Aligned new sequences together")
# Detect duplicates and remove them
existing_stk = AlignIO.read(existing_ali_path, "stockholm")
existing_ids = [r.id for r in existing_stk]
del existing_stk
new_stk = AlignIO.read(new_ali_path, "stockholm")
new_ids = [r.id for r in new_stk]
del new_stk
doublons = [i for i in existing_ids if i in new_ids]
del existing_ids, new_ids
if len(doublons):
warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
toremove.write('\n'.join(doublons)+'\n')
p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
os.remove(path_to_seq_data + "realigned/toremove.txt")
# And we merge the two alignments
p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
"--rna", existing_ali_path, new_ali_path],
stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
notify("Merged alignments into one")
# remove the partial files
os.remove(new_ali_path)
os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
else:
# Alignment does not exist yet. We need to compute it from scratch.
print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
cmd = ["cmalign"]
if alignopts is not None:
cmd += " ".split(alignopts)
cmd += ['-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
"--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
"--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
path_to_seq_data + f"realigned/{rfam_acc}.cm",
path_to_seq_data + f"realigned/{rfam_acc}++.fa"]
p = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
stderr = p.stderr.decode("utf-8")
if len(stderr):
print('', flush=True)
warn(f"Error during sequence alignment: {stderr}", error=True)
with open(runDir + "/errors.txt", "a") as er:
er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
return 1
else:
print('\t'+validsymb, flush=True)
# Convert Stockholm to aligned FASTA
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
# Convert Stockholm to aligned FASTA
subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
"--informat", "stockholm",
"afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part.
subprocess.run(["rm", "-f", "esltmp*"]) # We can use a joker here, because we are not running in parallel for this part.
@trace_unhandled_exceptions
def work_realign(useSina, alignopts, rfam_acc):
""" Runs multiple sequence alignements by RNA family.
It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
Rfam covariance models are used with Infernal tools or SINA based on the options provided by the user.
Even if the user prefers to use SINA it will be used only for rRNA and cmalign for other families
"""
setproctitle(f"RNAnet.py work_realign({rfam_acc})")
if useSina and rfam_acc in LSU_set | SSU_set:
use_sina(rfam_acc)
else:
use_infernal(rfam_acc, alignopts)
# Assert everything worked, or save an error
with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output:
if not len(output.readline()):
@@ -2402,6 +2441,73 @@ def work_realign(rfam_acc):
er.write(f"Failed to realign {rfam_acc} (killed)")
@trace_unhandled_exceptions
def work_pydca(f, columns_to_save):
"""
This function writes an alignment file containing only the columns which will be saved to the database,
converted to uppercase, and without non-ACGU nucleotides.
This file is then used by pydca to compute DCA features, and finally removed.
"""
align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
for s in align:
s.seq = s.seq.upper() # Convert to uppercase, as required by pydca
filtered_alignment = align[:, 1:1] # all the lines, but no columns
for p in columns_to_save:
filtered_alignment += align[:, p-1:p] # save columns one by one
# Replace all other letters with a deletion gap, because pydca
# only accepts the symbols A, C, G, U, '.' and '-'.
for s in filtered_alignment:
for i in range(len(s.seq)):
if s.seq[i].upper() not in "ACGU-.":
s.seq[i] = '-'
# Create a fasta file to be used by pydca
with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d:
try:
only_3d.write(format(filtered_alignment, "fasta"))
except ValueError as e:
warn(e)
# PyDCA instance with options:
# lambda_J is set by pydca to 0.2*(L-1), where L is the length of the sequences.
# The maximum number of gradient-descent iterations is set to 500.
# lambda_h is set to 1 and seqid to 0.8, as suggested by the pydca paper:
# Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1.0: a comprehensive software for direct coupling analysis of RNA and protein sequences.
# Bioinformatics. 2020;36(7):2264-2265. doi:10.1093/bioinformatics/btz892 - https://pubmed.ncbi.nlm.nih.gov/31778142/
plmdca_inst = plmdca.PlmDCA(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa",
"rna", seqid = 0.8, lambda_h = 1.0, num_threads = 10, max_iterations = 500)
number_of_sites = len(columns_to_save)*(len(columns_to_save)-1)//2 # L*(L-1)/2, where L = len(columns_to_save)
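# Worked example: with L = 10 saved columns, number_of_sites = 10*9//2 = 45 pairs.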
# Tuple of two lists of tuples:
# - the first list contains the fields of single sites (nucleotides),
# - the second contains the pairwise couplings (pairs of nucleotides).
# linear_dist is set to zero in order to keep all possible pairs:
# with linear_dist = x > 0, pydca only returns positions with |i-j| > x,
# which would make us lose a lot of pairs.
params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_sites)
# Frobenius norm with average product correction
fn_apc = plmdca_inst.compute_sorted_FN_APC()
# Save to file
np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", PARAMS=params, FNAPC=fn_apc)
# Build a dictionary used by the function that stores the frequencies in the align_column table
return_dict_fields = {}
for list_fields in params[0]:
# The element at index 0 is the column index in the filtered alignment,
# so looking up columns_to_save at that index gives the alignment column
# whose fields are to be stored in align_column.
return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1]
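# Shape assumed from the INSERT statement in work_pssm_remap: return_dict_fields maps an
# alignment column index to that column's four field values, consumed as (fields_A, fields_C, fields_G, fields_U).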
# Cleanup
subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"])
return return_dict_fields
@trace_unhandled_exceptions
def work_pssm_remap(f):
"""Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
This also remaps the 3D object sequence with the aligned sequence in the MSA.
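Per MSA column, the PSSM stores the observed nucleotide frequencies
(freq_A, freq_C, freq_G, freq_U, freq_other) and the gap percentage; for example,
a fully conserved A column would give freq_A close to 1.0 and the others near 0.0.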
@@ -2477,7 +2583,6 @@ def work_pssm_remap(f):
# At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars.
##########################################################################################
# Remap sequences of the 3D chains with sequences in the alignment
##########################################################################################
@@ -2560,11 +2665,29 @@ def work_pssm_remap(f):
# Get a sorted list from the set
columns = sorted(columns_to_save)
# Save the re_mappings
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
VALUES (?, ?, ?)
ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
many=True, data=re_mappings)
# Delete alignment columns that are not used anymore from the database
current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
unused = []
for col in current_family_columns:
if col not in columns_to_save:
unused.append((f, col))
sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
conn.commit()
##########################################################################################
# Save the alignment columns and their mappings to the database
# Retrieve or compute metadata relative to the MSA columns
##########################################################################################
setproctitle(f"RNAnet.py work_pssm_remap({f}) saving")
setproctitle(f"RNAnet.py work_pssm_remap({f}) insert/match states")
# Get back the information of match/insertion states from the STK file
if f not in SSU_set and f not in LSU_set:
@@ -2594,32 +2717,22 @@ def work_pssm_remap(f):
cm_coords = [ None for x in range(ncols) ]
cm_2d = [ None for x in range(ncols) ]
# Save the re_mappings
conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
VALUES (?, ?, ?)
ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
many=True, data=re_mappings)
setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA")
# Delete alignment columns that are not used anymore from the database
current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
unused = []
for col in current_family_columns:
if col not in columns_to_save:
unused.append((f, col))
sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
conn.commit()
rfam_fields_record = work_pydca(f, columns)
# Save the useful columns in the database
data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U,
gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
# Add an "unknown values" column with index_ali 0 (for nucleotides unresolved in 3D: they yield a gap '-' but face a letter in the alignment)
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct)
VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other,
fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
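# Unresolved 3D nucleotides can thus point to this sentinel column (index_ali 0)
# instead of a real alignment column.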
# Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
conn.close()
@@ -2663,9 +2776,10 @@ def work_save(c, homology=True):
if homology:
df = pd.read_sql_query(f"""
SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct, dbn,
paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta,
chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base,
is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G,
fields_U, gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR,
alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp,
eta, theta, eta_prime, theta_prime, eta_base, theta_base,
v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM
(SELECT chain_id, rfam_acc from chain WHERE chain_id = {c.db_chain_id})
NATURAL JOIN re_mapping
@@ -1206,7 +1206,7 @@ if __name__ == "__main__":
sys.exit()
elif opt == '--version':
print("RNANet statistics 1.3 beta")
print("RNANet statistics 1.4 beta")
sys.exit()
elif opt == "-r" or opt == "--resolution":
assert float(arg) > 0.0 and float(arg) <= 20.0