Louis Becquey

Merged Khodor-internship to master + general cmalign options

...@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser ...@@ -40,7 +40,7 @@ from Bio.SeqIO.FastaIO import FastaIterator, SimpleFastaParser
40 from Bio.Seq import MutableSeq 40 from Bio.Seq import MutableSeq
41 from Bio.SeqRecord import SeqRecord 41 from Bio.SeqRecord import SeqRecord
42 from Bio.Align import MultipleSeqAlignment 42 from Bio.Align import MultipleSeqAlignment
43 - 43 +from pydca.plmdca import plmdca
44 runDir = os.getcwd() 44 runDir = os.getcwd()
45 45
46 def trace_unhandled_exceptions(func): 46 def trace_unhandled_exceptions(func):
...@@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2]) ...@@ -76,11 +76,11 @@ python_executable = "python"+".".join(platform.python_version().split('.')[:2])
76 validsymb = '\U00002705' 76 validsymb = '\U00002705'
77 warnsymb = '\U000026A0' 77 warnsymb = '\U000026A0'
78 errsymb = '\U0000274C' 78 errsymb = '\U0000274C'
79 -
80 LSU_set = {"RF00002", "RF02540", "RF02541", 79 LSU_set = {"RF00002", "RF02540", "RF02541",
81 "RF02543", "RF02546"} # From Rfam CLAN 00112 80 "RF02543", "RF02546"} # From Rfam CLAN 00112
82 SSU_set = {"RF00177", "RF02542", "RF02545", 81 SSU_set = {"RF00177", "RF02542", "RF02545",
83 "RF01959", "RF01960"} # From Rfam CLAN 00111 82 "RF01959", "RF01960"} # From Rfam CLAN 00111
83 +
84 no_nts_set = set() 84 no_nts_set = set()
85 weird_mappings = set() 85 weird_mappings = set()
86 86
...@@ -845,14 +845,14 @@ class Downloader: ...@@ -845,14 +845,14 @@ class Downloader:
845 if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"): 845 if os.path.isfile(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv"):
846 print("\t> Use of the previous version.\t", end="", flush=True) 846 print("\t> Use of the previous version.\t", end="", flush=True)
847 else: 847 else:
848 - return pd.DataFrame([], columns=["class", "class_members"]) 848 + return pd.DataFrame([], columns=["class","representative","class_members"])
849 849
850 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv") 850 nrlist = pd.read_csv(path_to_3D_data + f"latest_nr_list_{nr_code}A.csv")
851 - full_structures_list = [ tuple(i[1]) for i in nrlist[['class', 'class_members']].iterrows() ] 851 + full_structures_list = [ tuple(i[1]) for i in nrlist[["class","representative","class_members"]].iterrows() ]
852 print(f"\t{validsymb}", flush=True) 852 print(f"\t{validsymb}", flush=True)
853 853
854 # The beginning of an adventure. 854 # The beginning of an adventure.
855 - return full_structures_list # list of ( str (class), str (class_members) ) 855 + return full_structures_list # list of ( str (class), str(representative),str (class_members) )
856 856
857 def download_from_SILVA(self, unit): 857 def download_from_SILVA(self, unit):
858 858
...@@ -966,6 +966,9 @@ class Pipeline: ...@@ -966,6 +966,9 @@ class Pipeline:
966 self.RUN_STATS = False 966 self.RUN_STATS = False
967 self.EXTRACT_CHAINS = False 967 self.EXTRACT_CHAINS = False
968 self.REUSE_ALL = False 968 self.REUSE_ALL = False
969 + self.REDUNDANT = False
970 + self.ALIGNOPTS = None
971 + self.USESINA = False
969 self.SELECT_ONLY = None 972 self.SELECT_ONLY = None
970 self.ARCHIVE = False 973 self.ARCHIVE = False
971 self.SAVELOGS = True 974 self.SAVELOGS = True
...@@ -981,8 +984,9 @@ class Pipeline: ...@@ -981,8 +984,9 @@ class Pipeline:
981 setproctitle("RNANet.py process_options()") 984 setproctitle("RNANet.py process_options()")
982 985
983 try: 986 try:
984 - opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=", "only=", "maxcores=", 987 + opts, _ = getopt.getopt(sys.argv[1:], "r:fhs", ["help", "resolution=", "3d-folder=", "seq-folder=", "keep-hetatm=",
985 - "from-scratch", "full-inference", "no-homology", "ignore-issues", "extract", 988 + "only=", "cmalign-opts=", "maxcores=", "sina", "from-scratch",
989 + "full-inference", "no-homology", "redundant", "ignore-issues", "extract",
986 "all", "no-logs", "archive", "update-homologous", "version"]) 990 "all", "no-logs", "archive", "update-homologous", "version"])
987 except getopt.GetoptError as err: 991 except getopt.GetoptError as err:
988 print(err) 992 print(err)
...@@ -1006,6 +1010,7 @@ class Pipeline: ...@@ -1006,6 +1010,7 @@ class Pipeline:
1006 print("--------------------------------------------------------------------------------------------------------------") 1010 print("--------------------------------------------------------------------------------------------------------------")
1007 print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of" 1011 print("-f [ --full-inference ]\t\tInfer new mappings even if Rfam already provides some. Yields more copies of"
1008 "\n\t\t\t\t chains mapped to different families.") 1012 "\n\t\t\t\t chains mapped to different families.")
1013 +        print("--redundant\t\t\t\tStore in the database the class members thought to be redundant for predictions.")
1009 print("-s\t\t\t\tRun statistics computations after completion") 1014 print("-s\t\t\t\tRun statistics computations after completion")
1010 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.") 1015 print("--extract\t\t\tExtract the portions of 3D RNA chains to individual mmCIF files.")
1011 print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. " 1016 print("--keep-hetatm=False\t\t(True | False) Keep ions, waters and ligands in produced mmCIF files. "
...@@ -1022,9 +1027,12 @@ class Pipeline: ...@@ -1022,9 +1027,12 @@ class Pipeline:
1022 print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:" 1027 print("--seq-folder=…\t\t\tPath to a folder to store the sequence and alignment files. Subfolders will be:"
1023 "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families" 1028 "\n\t\t\t\t\trfam_sequences/fasta/\tCompressed hits to Rfam families"
1024 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family") 1029 "\n\t\t\t\t\trealigned/\t\tSequences, covariance models, and alignments by family")
1030 +        print("--sina\t\t\tForce RNANet to align the large (LSU) and small (SSU) ribosomal subunit RNAs using SINA instead of Infernal;"
1031 +              "\n\t\t\t\t\t all other RNA families are still aligned with Infernal.")
1025 print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous" 1032 print("--maxcores=…\t\t\tLimit the number of cores to use in parallel portions to reduce the simultaneous"
1026 "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions" 1033 "\n\t\t\t\t need of RAM. Should be a number between 1 and your number of CPUs. Note that portions"
1027 "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.") 1034 "\n\t\t\t\t of the pipeline already limit themselves to 50% or 70% of that number by default.")
1035 + print("--cmalign-opts=…\t\tA string of additional options to pass to cmalign aligner, e.g. \"--nonbanded --mxsize 2048\"")
1028 print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments," 1036 print("--archive\t\t\tCreate tar.gz archives of the datapoints text files and the alignments,"
1029 "\n\t\t\t\t and update the link to the latest archive. ") 1037 "\n\t\t\t\t and update the link to the latest archive. ")
1030 print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications") 1038 print("--no-logs\t\t\tDo not save per-chain logs of the numbering modifications")
...@@ -1042,8 +1050,8 @@ class Pipeline: ...@@ -1042,8 +1050,8 @@ class Pipeline:
1042 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &") 1050 print(f"nohup bash -c 'time {fileDir}/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --no-logs' &")
1043 sys.exit() 1051 sys.exit()
1044 elif opt == '--version': 1052 elif opt == '--version':
1045 - print("RNANet v1.3 beta, parallelized, Dockerized") 1053 + print("RNANet v1.4 beta, parallelized, Dockerized")
1046 - print("Last revision : Jan 2021") 1054 + print("Last revision : April 2021")
1047 sys.exit() 1055 sys.exit()
1048 elif opt == "-r" or opt == "--resolution": 1056 elif opt == "-r" or opt == "--resolution":
1049 assert float(arg) > 0.0 and float(arg) <= 20.0 1057 assert float(arg) > 0.0 and float(arg) <= 20.0
...@@ -1089,6 +1097,8 @@ class Pipeline: ...@@ -1089,6 +1097,8 @@ class Pipeline:
1089 path_to_seq_data + "realigned", 1097 path_to_seq_data + "realigned",
1090 path_to_seq_data + "rfam_sequences"]) 1098 path_to_seq_data + "rfam_sequences"])
1091 self.REUSE_ALL = True 1099 self.REUSE_ALL = True
1100 +            elif opt == "--cmalign-opts":
1101 + self.ALIGNOPTS = arg
1092 elif opt == "--all": 1102 elif opt == "--all":
1093 self.REUSE_ALL = True 1103 self.REUSE_ALL = True
1094 self.USE_KNOWN_ISSUES = False 1104 self.USE_KNOWN_ISSUES = False
...@@ -1103,6 +1113,10 @@ class Pipeline: ...@@ -1103,6 +1113,10 @@ class Pipeline:
1103 ncores = min(ncores, int(arg)) 1113 ncores = min(ncores, int(arg))
1104 elif opt == "-f" or opt == "--full-inference": 1114 elif opt == "-f" or opt == "--full-inference":
1105 self.FULLINFERENCE = True 1115 self.FULLINFERENCE = True
1116 + elif opt=="--redundant":
1117 + self.REDUNDANT=True
1118 + elif opt=="--sina":
1119 + self.USESINA=True
1106 1120
1107 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions": 1121 if self.HOMOLOGY and "tobedefinedbyoptions" in [path_to_3D_data, path_to_seq_data] or path_to_3D_data == "tobedefinedbyoptions":
1108 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments") 1122 print("usage: RNANet.py --3d-folder path/where/to/store/chains --seq-folder path/where/to/store/alignments")
...@@ -1151,7 +1165,8 @@ class Pipeline: ...@@ -1151,7 +1165,8 @@ class Pipeline:
1151 work_infer_mappings, 1165 work_infer_mappings,
1152 not self.REUSE_ALL, 1166 not self.REUSE_ALL,
1153 allmappings, 1167 allmappings,
1154 - self.FULLINFERENCE 1168 + self.FULLINFERENCE,
1169 + self.REDUNDANT
1155 ), 1170 ),
1156 full_structures_list, 1171 full_structures_list,
1157 chunksize=1)): 1172 chunksize=1)):
...@@ -1340,7 +1355,7 @@ class Pipeline: ...@@ -1340,7 +1355,7 @@ class Pipeline:
1340 joblist = [] 1355 joblist = []
1341 for f in self.fam_list: 1356 for f in self.fam_list:
1342 joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[ 1357 joblist.append(Job(function=work_prepare_sequences, how_many_in_parallel=ncores, args=[
1343 - self.dl, f, rfam_acc_to_download[f]])) 1358 + self.dl, self.USESINA, f, rfam_acc_to_download[f]]))
1344 try: 1359 try:
1345 execute_joblist(joblist) 1360 execute_joblist(joblist)
1346 1361
...@@ -1365,7 +1380,7 @@ class Pipeline: ...@@ -1365,7 +1380,7 @@ class Pipeline:
1365 joblist = [] 1380 joblist = []
1366 for f in self.fam_list: 1381 for f in self.fam_list:
1367 # the function already uses all CPUs so launch them one by one (how_many_in_parallel=1) 1382 # the function already uses all CPUs so launch them one by one (how_many_in_parallel=1)
1368 - joblist.append(Job(function=work_realign, args=[f], how_many_in_parallel=1, label=f)) 1383 + joblist.append(Job(function=work_realign, args=[self.USESINA, self.ALIGNOPTS, f], how_many_in_parallel=1, label=f))
1369 1384
1370 # Execute the jobs 1385 # Execute the jobs
1371 try: 1386 try:
...@@ -1753,6 +1768,10 @@ def sql_define_tables(conn): ...@@ -1753,6 +1768,10 @@ def sql_define_tables(conn):
1753 freq_G REAL, 1768 freq_G REAL,
1754 freq_U REAL, 1769 freq_U REAL,
1755 freq_other REAL, 1770 freq_other REAL,
1771 + fields_A REAL,
1772 + fields_C REAL,
1773 + fields_G REAL,
1774 + fields_U REAL,
1756 gap_percent REAL, 1775 gap_percent REAL,
1757 consensus CHAR(1), 1776 consensus CHAR(1),
1758 cons_sec_struct CHAR(1), 1777 cons_sec_struct CHAR(1),
...@@ -1937,7 +1956,7 @@ def execute_joblist(fulljoblist): ...@@ -1937,7 +1956,7 @@ def execute_joblist(fulljoblist):
1937 return results 1956 return results
1938 1957
1939 @trace_unhandled_exceptions 1958 @trace_unhandled_exceptions
1940 -def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> list: 1959 +def work_infer_mappings(update_only, allmappings, fullinference,redundant, codelist) -> list:
1941 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list, 1960 """Given a list of PDB chains corresponding to an equivalence class from BGSU's NR list,
1942 build a list of Chain() objects mapped to Rfam families, by expanding available mappings 1961 build a list of Chain() objects mapped to Rfam families, by expanding available mappings
1943 of any element of the list to all the list elements. 1962 of any element of the list to all the list elements.
...@@ -1950,8 +1969,8 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li ...@@ -1950,8 +1969,8 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
1950 1969
1951 # Split the comma-separated list of chain codes into chain codes: 1970 # Split the comma-separated list of chain codes into chain codes:
1952 eq_class = codelist[0] 1971 eq_class = codelist[0]
1953 - codes = codelist[1].replace('+', ',').split(',') 1972 + codes = codelist[2].replace('+', ',').split(',')
1954 - 1973 + representative=codelist[1].replace('+', ',').split(',')[0]
1955 # Search for mappings that apply to an element of this PDB chains list: 1974 # Search for mappings that apply to an element of this PDB chains list:
1956 for c in codes: 1975 for c in codes:
1957 # search for Rfam mappings with this chain c: 1976 # search for Rfam mappings with this chain c:
...@@ -2040,6 +2059,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li ...@@ -2040,6 +2059,13 @@ def work_infer_mappings(update_only, allmappings, fullinference, codelist) -> li
2040 2059
2041 # Now build Chain() objects for the mapped chains 2060 # Now build Chain() objects for the mapped chains
2042 for c in codes: 2061 for c in codes:
2062 +
2063 + if not redundant and c!=representative:
2064 + '''
2065 + by default save only the representative member
2066 + if redundant is passed then save all the chains of the class members
2067 + '''
2068 + continue
2043 nr = c.split('|') 2069 nr = c.split('|')
2044 pdb_id = nr[0].lower() 2070 pdb_id = nr[0].lower()
2045 pdb_model = int(nr[1]) 2071 pdb_model = int(nr[1])
...@@ -2202,13 +2228,12 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True): ...@@ -2202,13 +2228,12 @@ def work_build_chain(c, extract, khetatm, retrying=False, save_logs=True):
2202 return c 2228 return c
2203 2229
2204 @trace_unhandled_exceptions 2230 @trace_unhandled_exceptions
2205 -def work_prepare_sequences(dl, rfam_acc, chains): 2231 +def work_prepare_sequences(dl, useSina, rfam_acc, chains):
2206 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA. 2232 """Prepares FASTA files of homologous sequences to realign with cmalign or SINA.
2207 """ 2233 """
2208 2234
2209 setproctitle("RNAnet.py work_prepare_sequences()") 2235 setproctitle("RNAnet.py work_prepare_sequences()")
2210 - 2236 + if useSina and rfam_acc in LSU_set | SSU_set:
2211 - if rfam_acc in LSU_set | SSU_set: # rRNA
2212 if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"): 2237 if os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}++.afa"):
2213 # Detect doublons and remove them 2238 # Detect doublons and remove them
2214 existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta") 2239 existing_afa = AlignIO.read(path_to_seq_data + f"realigned/{rfam_acc}++.afa", "fasta")
...@@ -2290,109 +2315,122 @@ def work_prepare_sequences(dl, rfam_acc, chains): ...@@ -2290,109 +2315,122 @@ def work_prepare_sequences(dl, rfam_acc, chains):
2290 notify(status) 2315 notify(status)
2291 2316
2292 @trace_unhandled_exceptions 2317 @trace_unhandled_exceptions
2293 -def work_realign(rfam_acc): 2318 +def use_sina(rfam_acc):
2294 - """ Runs multiple sequence alignements by RNA family.
2295 -
2296 - It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
2297 - Rfam covariance models are used with Infernal tools, except for rRNAs.
2298 - cmalign requires too much RAM for them, so we use SINA, a specifically designed tool for rRNAs.
2299 """ 2319 """
2300 - 2320 + When prompted by the user to use SINA the software will use SINA for rRNA SSU and LSU
2301 - setproctitle(f"RNAnet.py work_realign({rfam_acc})") 2321 + """
2302 - 2322 + if rfam_acc in ["RF00177", "RF01960"]:
2303 - if rfam_acc in LSU_set | SSU_set: 2323 + arbfile = "realigned/SSU.arb"
2304 - # Ribosomal subunits deserve a special treatment.
2305 - # They require too much RAM to be aligned with Infernal.
2306 - # Then we will use SINA instead.
2307 - if rfam_acc in ["RF00177", "RF01960"]:
2308 - arbfile = "realigned/SSU.arb"
2309 - else:
2310 - arbfile = "realigned/LSU.arb"
2311 -
2312 - # Run alignment
2313 - p = subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
2314 - "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2315 - "-r", path_to_seq_data + arbfile,
2316 - "--meta-fmt=csv"])
2317 else: 2324 else:
2318 - # Align using Infernal for most RNA families 2325 + arbfile = "realigned/LSU.arb"
2319 -
2320 - if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
2321 - # Alignment exists. We just want to add new sequences into it.
2322 -
2323 - if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
2324 - # there are no new sequences to align...
2325 - return
2326 2326
2327 - existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk" 2327 + # Run alignment
2328 - new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk" 2328 + subprocess.run(["sina", "-i", path_to_seq_data + f"realigned/{rfam_acc}++.fa",
2329 + "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2330 + "-r", path_to_seq_data + arbfile,
2331 + "--meta-fmt=csv"])
2329 2332
2330 - # Align the new sequences 2333 +@trace_unhandled_exceptions
2331 - with open(new_ali_path, 'w') as o: 2334 +def use_infernal(rfam_acc, alignopts):
2332 - p1 = subprocess.run(["cmalign", "--nonbanded", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", 2335 + """
2333 - "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", 2336 + Infernal is our default alignment tool except if prompted by the user.
2334 - path_to_seq_data + f"realigned/{rfam_acc}.cm", 2337 + Cmalign will be used for all families except when the user prefers to align rRNA with SINA
2335 - path_to_seq_data + f"realigned/{rfam_acc}_new.fa"], 2338 + """
2336 - stdout=o, stderr=subprocess.PIPE) 2339 + if os.path.isfile(path_to_seq_data + "realigned/" + rfam_acc + "++.stk"):
2337 - notify("Aligned new sequences together") 2340 + # Alignment exists. We just want to add new sequences into it.
2338 -
2339 - # Detect doublons and remove them
2340 - existing_stk = AlignIO.read(existing_ali_path, "stockholm")
2341 - existing_ids = [r.id for r in existing_stk]
2342 - del existing_stk
2343 - new_stk = AlignIO.read(new_ali_path, "stockholm")
2344 - new_ids = [r.id for r in new_stk]
2345 - del new_stk
2346 - doublons = [i for i in existing_ids if i in new_ids]
2347 - del existing_ids, new_ids
2348 - if len(doublons):
2349 - warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
2350 - with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
2351 - toremove.write('\n'.join(doublons)+'\n')
2352 - p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
2353 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2354 - p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
2355 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2356 - os.remove(path_to_seq_data + "realigned/toremove.txt")
2357 -
2358 - # And we merge the two alignments
2359 - p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
2360 - "--rna", existing_ali_path, new_ali_path],
2361 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2362 - stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
2363 - subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
2364 - notify("Merged alignments into one")
2365 2341
2366 - # remove the partial files 2342 + if not os.path.isfile(path_to_seq_data + f"realigned/{rfam_acc}_new.fa"):
2367 - os.remove(new_ali_path) 2343 + # there are no new sequences to align...
2368 - os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa") 2344 + return
2369 2345
2370 - else: 2346 + existing_ali_path = path_to_seq_data + f"realigned/{rfam_acc}++.stk"
2371 - # Alignment does not exist yet. We need to compute it from scratch. 2347 + new_ali_path = path_to_seq_data + f"realigned/{rfam_acc}_new.stk"
2372 - print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
2373 2348
2374 - p = subprocess.run(["cmalign", "--nonbanded", '-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk", 2349 + # Align the new sequences
2375 - "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins", "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv", 2350 + with open(new_ali_path, 'w') as o:
2351 + p1 = subprocess.run(["cmalign", "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
2352 + "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
2376 path_to_seq_data + f"realigned/{rfam_acc}.cm", 2353 path_to_seq_data + f"realigned/{rfam_acc}.cm",
2377 - path_to_seq_data + f"realigned/{rfam_acc}++.fa"], 2354 + path_to_seq_data + f"realigned/{rfam_acc}_new.fa"],
2378 - stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) 2355 + stdout=o, stderr=subprocess.PIPE)
2379 - stderr = p.stderr.decode("utf-8") 2356 + notify("Aligned new sequences together")
2380 - 2357 +
2381 - if len(stderr): 2358 + # Detect doublons and remove them
2382 - print('', flush=True) 2359 + existing_stk = AlignIO.read(existing_ali_path, "stockholm")
2383 - warn(f"Error during sequence alignment: {stderr}", error=True) 2360 + existing_ids = [r.id for r in existing_stk]
2384 - with open(runDir + "/errors.txt", "a") as er: 2361 + del existing_stk
2385 - er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n') 2362 + new_stk = AlignIO.read(new_ali_path, "stockholm")
2386 - return 1 2363 + new_ids = [r.id for r in new_stk]
2387 - else: 2364 + del new_stk
2388 - print('\t'+validsymb, flush=True) 2365 + doublons = [i for i in existing_ids if i in new_ids]
2366 + del existing_ids, new_ids
2367 + if len(doublons):
2368 + warn(f"Removing {len(doublons)} doublons from existing {rfam_acc}++.stk and using their newest version")
2369 + with open(path_to_seq_data + "realigned/toremove.txt", "w") as toremove:
2370 + toremove.write('\n'.join(doublons)+'\n')
2371 + p = subprocess.run(["esl-alimanip", "--seq-r", path_to_seq_data + "realigned/toremove.txt", "-o", existing_ali_path+"2", existing_ali_path],
2372 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2373 + p = subprocess.run(["mv", existing_ali_path+"2", existing_ali_path],
2374 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2375 + os.remove(path_to_seq_data + "realigned/toremove.txt")
2376 +
2377 + # And we merge the two alignments
2378 + p2 = subprocess.run(["esl-alimerge", "-o", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk",
2379 + "--rna", existing_ali_path, new_ali_path],
2380 + stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2381 + stderr = p1.stderr.decode('utf-8') + p2.stderr.decode('utf-8')
2382 + subprocess.run(["mv", path_to_seq_data + f"realigned/{rfam_acc}_merged.stk", existing_ali_path])
2383 + notify("Merged alignments into one")
2384 +
2385 + # remove the partial files
2386 + os.remove(new_ali_path)
2387 + os.remove(path_to_seq_data + f"realigned/{rfam_acc}_new.fa")
2388 + else:
2389 + # Alignment does not exist yet. We need to compute it from scratch.
2390 + print(f"\t> Aligning {rfam_acc} sequences together (cmalign) ...", end='', flush=True)
2391 +
2392 + cmd = ["cmalign"]
2393 + if alignopts is not None:
2394 +        cmd += alignopts.split()
2395 + cmd += ['-o', path_to_seq_data + f"realigned/{rfam_acc}++.stk",
2396 + "--ifile", path_to_seq_data + f"realigned/{rfam_acc}.ins",
2397 + "--sfile", path_to_seq_data + f"realigned/{rfam_acc}.tsv",
2398 + path_to_seq_data + f"realigned/{rfam_acc}.cm",
2399 + path_to_seq_data + f"realigned/{rfam_acc}++.fa"]
2400 +
2401 + p = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
2402 + stderr = p.stderr.decode("utf-8")
2403 +
2404 + if len(stderr):
2405 + print('', flush=True)
2406 + warn(f"Error during sequence alignment: {stderr}", error=True)
2407 + with open(runDir + "/errors.txt", "a") as er:
2408 + er.write(f"Attempting to realign {rfam_acc}:\n" + stderr + '\n')
2409 + return 1
2410 + else:
2411 + print('\t'+validsymb, flush=True)
2389 2412
2390 - # Convert Stockholm to aligned FASTA 2413 + # Convert Stockholm to aligned FASTA
2391 - subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa", 2414 + subprocess.run(["esl-reformat", "-o", path_to_seq_data + f"realigned/{rfam_acc}++.afa",
2392 "--informat", "stockholm", 2415 "--informat", "stockholm",
2393 "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"]) 2416 "afa", path_to_seq_data + f"realigned/{rfam_acc}++.stk"])
2394 - subprocess.run(["rm", "-f", "esltmp*"]) # We can, because we are not running in parallel for this part. 2417 + subprocess.run(["rm", "-f", "esltmp*"]) # We can use a joker here, because we are not running in parallel for this part.
2418 +
2419 +@trace_unhandled_exceptions
2420 +def work_realign(useSina, alignopts, rfam_acc):
2421 + """ Runs multiple sequence alignements by RNA family.
2395 2422
2423 + It aligns the Rfam hits from a RNA family with the sequences from the list of chains.
2424 + Rfam covariance models are used with Infernal tools or SINA based on the options provided by the user.
2425 + Even if the user prefers to use SINA it will be used only for rRNA and cmalign for other families
2426 + """
2427 +
2428 + setproctitle(f"RNAnet.py work_realign({rfam_acc})")
2429 + if useSina and rfam_acc in LSU_set | SSU_set:
2430 + use_sina(rfam_acc)
2431 + else:
2432 + use_infernal(rfam_acc, alignopts)
2433 +
2396 # Assert everything worked, or save an error 2434 # Assert everything worked, or save an error
2397 with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output: 2435 with open(path_to_seq_data + f"realigned/{rfam_acc}++.afa", 'r') as output:
2398 if not len(output.readline()): 2436 if not len(output.readline()):
...@@ -2402,6 +2440,73 @@ def work_realign(rfam_acc): ...@@ -2402,6 +2440,73 @@ def work_realign(rfam_acc):
2402 er.write(f"Failed to realign {rfam_acc} (killed)") 2440 er.write(f"Failed to realign {rfam_acc} (killed)")
2403 2441
2404 @trace_unhandled_exceptions 2442 @trace_unhandled_exceptions
2443 +def work_pydca(f, columns_to_save):
2444 + """
2445 + This function writes an alignment file containing only the columns which will be saved to the database,
2446 + converted to uppercase, and without non-ACGU nucleotides.
2447 + This file in then used by pydca to compute DCA features, and finally removed.
2448 + """
2449 +
2450 +    align = AlignIO.read(path_to_seq_data + f"realigned/{f}++.afa", "fasta")
2451 + for s in align:
2452 + s.seq=s.seq.upper() # Convert to uppercase as needed for pydca
2453 + filtered_alignment = align[:, 1:1] # all the lines, but no columns
2454 + for p in columns_to_save:
2455 + filtered_alignment += align[:, p-1:p] # save columns one by one
2456 +
2457 + # Replace all other letters by a deletion gap just for the
2458 + # aim to use pydca as sites other than ACGU . and - are not accepted
2459 + for s in filtered_alignment:
2460 + for i in range(len(s.seq)):
2461 + if s.seq[i].upper() not in "ACGU-.":
2462 + s.seq[i]='-'
2463 +
2464 + # Create a fasta file to be used by pydca
2465 + with open(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa", "w") as only_3d:
2466 + try:
2467 + only_3d.write(format(filtered_alignment, "fasta"))
2468 + except ValueError as e:
2469 + warn(e)
2470 +
2471 + # PyDCA instance with options,
2472 + # Here lamda_J is set by pydca to 0.2*(L-1) where L is the length of the sequences
2473 + # The maximum number of iterations is set to 500 for gradient descent
2474 + # Lamda_h is set to 1 and seqid is set to 0.8 as suggested by pydca papers
2475 + # Reference:
2476 + # Zerihun MB, Pucci F, Peter EK, Schug A. pydca v1. 0: a comprehensive software for Direct Coupling Analysis of RNA and Protein Sequences. Bioinformatics.
2477 + # 2020;36(7):2264–2265. 10.1093/bioinformatics/btz892 - DOI - https://pubmed.ncbi.nlm.nih.gov/31778142/
2478 + plmdca_inst = plmdca.PlmDCA(path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa",
2479 + "rna", seqid = 0.8, lambda_h = 1.0, num_threads = 10, max_iterations = 500)
2480 + number_of_sites=len(columns_to_save)*(len(columns_to_save)-1)//2 # L*(L-1)/2 where L=len(columns_to_save)
2481 +
2482 + # Tuple of two list of tuples
2483 + # - the first list contains the fields of sites (nucleotides)
2484 + # - the second contains pairwise fields (2 nucleotides)
2485 + # linear distance is zero in order to keep all possible pairs
2486 + # because if linear dist=x>0 the pydca will return position |i-j|>x
2487 + # which will force us to lose a lot of pairs
2488 + params = plmdca_inst.compute_params(linear_dist=0, num_site_pairs=number_of_sites)
2489 +
2490 + # Fröbenius norm with average product correction
2491 + fn_apc = plmdca_inst.compute_sorted_FN_APC()
2492 +
2493 + # Save to file
2494 + np.savez(path_to_seq_data+f"/realigned/{f}_pydca.npz", PARAMS=params, FNAPC=fn_apc)
2495 +
2496 + # A dictionary to be used in the function where the frequencies are stored in align_column table
2497 + return_dict_fields={}
2498 + for list_fields in params[0]:
2499 + # The element at 0 is the index
2500 + # So taking the value from column to save at that index will give us
2501 + # the fields to be stored at align_column in the table
2502 + return_dict_fields[columns_to_save[list_fields[0]]] = list_fields[1]
2503 +
2504 + # Cleanup
2505 + subprocess.run(["rm", "-f", path_to_seq_data+f"/realigned/{f}_filtered_for_pydca.afa"])
2506 +
2507 + return return_dict_fields
2508 +
2509 +@trace_unhandled_exceptions
2405 def work_pssm_remap(f): 2510 def work_pssm_remap(f):
2406 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family. 2511 """Computes Position-Specific-Scoring-Matrices given the multiple sequence alignment of the RNA family.
2407 This also remaps the 3D object sequence with the aligned sequence in the MSA. 2512 This also remaps the 3D object sequence with the aligned sequence in the MSA.
...@@ -2477,7 +2582,6 @@ def work_pssm_remap(f): ...@@ -2477,7 +2582,6 @@ def work_pssm_remap(f):
2477 2582
2478 # At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars. 2583 # At this point, pssm_info is a numpy array containing the PSSM and consensus a list of consensus chars.
2479 2584
2480 -
2481 ########################################################################################## 2585 ##########################################################################################
2482 # Remap sequences of the 3D chains with sequences in the alignment 2586 # Remap sequences of the 3D chains with sequences in the alignment
2483 ########################################################################################## 2587 ##########################################################################################
...@@ -2560,11 +2664,29 @@ def work_pssm_remap(f): ...@@ -2560,11 +2664,29 @@ def work_pssm_remap(f):
2560 # Get a sorted list from the set 2664 # Get a sorted list from the set
2561 columns = sorted(columns_to_save) 2665 columns = sorted(columns_to_save)
2562 2666
2667 + # Save the re_mappings
2668 + conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
2669 + conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
2670 + sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
2671 + VALUES (?, ?, ?)
2672 + ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
2673 + many=True, data=re_mappings)
2674 +
2675 +    # Delete from the database the alignment columns that are no longer used
2676 + current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
2677 + unused = []
2678 + for col in current_family_columns:
2679 + if col not in columns_to_save:
2680 + unused.append((f, col))
2681 + sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
2682 + conn.commit()
2683 +
2684 +
2563 ########################################################################################## 2685 ##########################################################################################
2564 - # Save the alignment columns and their mappings to the database 2686 + # Retrieve or compute metadata relative to the MSA columns
2565 ########################################################################################## 2687 ##########################################################################################
2566 2688
2567 - setproctitle(f"RNAnet.py work_pssm_remap({f}) saving") 2689 + setproctitle(f"RNAnet.py work_pssm_remap({f}) insert/match states")
2568 2690
2569 # Get back the information of match/insertion states from the STK file 2691 # Get back the information of match/insertion states from the STK file
2570 alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm") 2692 alignstk = AlignIO.read(path_to_seq_data + "realigned/" + f + "++.stk", "stockholm")
...@@ -2590,32 +2712,22 @@ def work_pssm_remap(f): ...@@ -2590,32 +2712,22 @@ def work_pssm_remap(f):
2590 cm_2d.append(".") 2712 cm_2d.append(".")
2591 cm_coord += 1 2713 cm_coord += 1
2592 2714
2593 - # Save the re_mappings 2715 + setproctitle(f"RNAnet.py work_pssm_remap({f}) Potts model, DCA")
2594 - conn = sqlite3.connect(runDir + '/results/RNANet.db', timeout=20.0)
2595 - conn.execute('pragma journal_mode=wal') # Allow multiple other readers to ask things while we execute this writing query
2596 - sql_execute(conn, """INSERT INTO re_mapping (chain_id, index_chain, index_ali)
2597 - VALUES (?, ?, ?)
2598 - ON CONFLICT(chain_id, index_chain) DO UPDATE SET index_ali=excluded.index_ali;""",
2599 - many=True, data=re_mappings)
2600 2716
2601 - # Delete alignment columns that are not used anymore from the database 2717 + rfam_fields_record = work_pydca(f, columns)
2602 - current_family_columns = [ x[0] for x in sql_ask_database(conn, f"SELECT index_ali FROM align_column WHERE rfam_acc = '{f}';")]
2603 - unused = []
2604 - for col in current_family_columns:
2605 - if col not in columns_to_save:
2606 - unused.append((f, col))
2607 - sql_execute(conn, """DELETE FROM align_column WHERE rfam_acc = ? AND index_ali = ?;""", many=True, data=unused)
2608 - conn.commit()
2609 2718
2610 - # Save the useful columns in the database 2719 + data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + tuple(rfam_fields_record[j]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)]
2611 - data = [(f, j, cm_coords[j-1]) + tuple(pssm_info[:,j-1]) + (consensus[j-1], cm_2d[j-1]) for j in sorted(columns_to_save)] 2720 + sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
2612 - sql_execute(conn, """INSERT INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) 2721 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
2613 - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) ON CONFLICT(rfam_acc, index_ali) DO
2614 UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U, 2722 UPDATE SET cm_coord=excluded.cm_coord, freq_A=excluded.freq_A, freq_C=excluded.freq_C, freq_G=excluded.freq_G, freq_U=excluded.freq_U,
2615 - freq_other=excluded.freq_other, gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data) 2723 + freq_other=excluded.freq_other, fields_A=excluded.fields_A, fields_C=excluded.fields_C, fields_G=excluded.fields_G, fields_U=excluded.fields_U,
2724 + gap_percent=excluded.gap_percent, consensus=excluded.consensus, cons_sec_struct=excluded.cons_sec_struct;""", many=True, data=data)
2616 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment) 2725 # Add an unknown values column, with index_ali 0 (for nucleotides unsolved in 3D giving a gap '-' but found facing letter in the alignment)
2617 - sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct) 2726 + sql_execute(conn, f"""INSERT OR IGNORE INTO align_column (rfam_acc, index_ali, cm_coord, freq_A, freq_C, freq_G, freq_U, freq_other,
2618 - VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', "NULL");""", data=(f,)) 2727 + fields_A, fields_C, fields_G, fields_U, gap_percent, consensus, cons_sec_struct)
2728 + VALUES (?, 0, NULL, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, '-', NULL);""", data=(f,))
2729 +
2730 +
2619 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains) 2731 # Save the number of "used columns" to table family ( = the length of the alignment if it was composed only of the RNANet chains)
2620 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f)) 2732 sql_execute(conn, f"UPDATE family SET ali_filtered_len = ? WHERE rfam_acc = ?;", data=(len(columns_to_save), f))
2621 conn.close() 2733 conn.close()
...@@ -2659,9 +2771,10 @@ def work_save(c, homology=True): ...@@ -2659,9 +2771,10 @@ def work_save(c, homology=True):
2659 if homology: 2771 if homology:
2660 df = pd.read_sql_query(f""" 2772 df = pd.read_sql_query(f"""
2661 SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord, 2773 SELECT index_chain, old_nt_resnum, nt_position, nt_name, nt_code, nt_align_code, cm_coord,
2662 - is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, gap_percent, consensus, cons_sec_struct, dbn, 2774 + is_A, is_C, is_G, is_U, is_other, freq_A, freq_C, freq_G, freq_U, freq_other, fields_A, fields_C, fields_G,
2663 - paired, nb_interact, pair_type_LW, pair_type_DSSR, alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, 2775 + fields_U, gap_percent, consensus, cons_sec_struct, dbn, paired, nb_interact, pair_type_LW, pair_type_DSSR,
2664 - chi, bb_type, glyco_bond, form, ssZp, Dp, eta, theta, eta_prime, theta_prime, eta_base, theta_base, 2776 + alpha, beta, gamma, delta, epsilon, zeta, epsilon_zeta, chi, bb_type, glyco_bond, form, ssZp, Dp,
2777 + eta, theta, eta_prime, theta_prime, eta_base, theta_base,
2665 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM 2778 v0, v1, v2, v3, v4, amplitude, phase_angle, puckering FROM
2666 (SELECT chain_id, rfam_acc from chain WHERE chain_id = {c.db_chain_id}) 2779 (SELECT chain_id, rfam_acc from chain WHERE chain_id = {c.db_chain_id})
2667 NATURAL JOIN re_mapping 2780 NATURAL JOIN re_mapping
......
...@@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s): ...@@ -952,8 +952,8 @@ def par_distance_matrix(filelist, f, label, consider_all_atoms, s):
952 else: 952 else:
953 d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j]) 953 d[i,j] = get_euclidian_distance(coordinates_with_gaps[i], coordinates_with_gaps[j])
954 954
955 - if f not in LSU_set and f not in SSU_set: 955 + # if f not in LSU_set and f not in SSU_set:
956 - np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f") 956 + np.savetxt(runDir + '/results/distance_matrices/' + f + '_'+ label + '/'+ s.id.strip("\'") + '.csv', d, delimiter=",", fmt="%.3f")
957 return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d) 957 return 1-np.isnan(d).astype(int), np.nan_to_num(d), np.nan_to_num(d*d)
958 958
959 @trace_unhandled_exceptions 959 @trace_unhandled_exceptions
...@@ -1169,7 +1169,7 @@ if __name__ == "__main__": ...@@ -1169,7 +1169,7 @@ if __name__ == "__main__":
1169 1169
1170 sys.exit() 1170 sys.exit()
1171 elif opt == '--version': 1171 elif opt == '--version':
1172 - print("RNANet statistics 1.3 beta") 1172 + print("RNANet statistics 1.4 beta")
1173 sys.exit() 1173 sys.exit()
1174 elif opt == "-r" or opt == "--resolution": 1174 elif opt == "-r" or opt == "--resolution":
1175 assert float(arg) > 0.0 and float(arg) <= 20.0 1175 assert float(arg) > 0.0 and float(arg) <= 20.0
......