Revision 1 for Bioinformatics completed

Louis BECQUEY
Commit 7196427d1340f4cf45c6878c26e440a111e7e55d 7196427d 1 parent d266c643
Showing 7 changed files with 10 additions and 6 deletions
.gitignore
README.md
RNAnet.py
known_issues.txt
known_issues_reasons.txt
regression.py
statistics.py
--- a/.gitignore
View file @7196427
+++ b/.gitignore
View file @7196427
@@ -12,4 +12,5 @@ esl*
 # environment stuff
 .vscode/
-*.pyc
\ No newline at end of file
+*.pyc
+__pycache__/
\ No newline at end of file
--- a/README.md
View file @7196427
+++ b/README.md
View file @7196427
@@ -94,6 +94,8 @@ The detailed list of options is below:
 -h [ --help ]                   Print this help message
 --version                       Print the program version
+-f [ --full-inference ]         Infer new 3D->family mappings even if Rfam already provides some. Yields more copies of chains
+                                mapped to different families.
 -r 4.0 [ --resolution=4.0 ]     Maximum 3D structure resolution to consider an RNA chain.
 -s                              Run statistics computations after completion
 --extract                       Extract the portions of 3D RNA chains to individual mmCIF files.
@@ -105,7 +107,7 @@ The detailed list of options is below:
                                         RNAcifs/                Full structures containing RNA, in mmCIF format
                                         rna_mapped_to_Rfam/     Extracted 'pure' RNA chains
                                         datapoints/             Final results in CSV file format.
---seq-folder=…                  Path to a folder to store the sequence and alignment files.
+--seq-folder=…                  Path to a folder to store the sequence and alignment files. Subfolders will be:
                                         rfam_sequences/fasta/   Compressed hits to Rfam families
                                         realigned/              Sequences, covariance models, and alignments by family
 --no-homology                   Do not try to compute PSSMs and do not align sequences.
@@ -117,11 +119,12 @@ The detailed list of options is below:
 --update-homologous             Re-download Rfam and SILVA databases, realign all families, and recompute all CSV files
 --from-scratch                  Delete database, local 3D and sequence files, and known issues, and recompute.
 --archive                       Create a tar.gz archive of the datapoints text files, and update the link to the latest archive
+--no-logs                       Do not save per-chain logs of the numbering modifications
 ```
 Typical usage:
 ```
-nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s --archive' &
+nohup bash -c 'time ~/Projects/RNANet/RNAnet.py --3d-folder ~/Data/RNA/3D/ --seq-folder ~/Data/RNA/sequences -s' &
 ```
 ## Post-computation task: estimate quality
--- a/RNAnet.py
View file @7196427
+++ b/RNAnet.py
View file @7196427
--- a/known_issues.txt
View file @7196427
+++ b/known_issues.txt
View file @7196427
--- a/known_issues_reasons.txt
View file @7196427
+++ b/known_issues_reasons.txt
View file @7196427
--- a/regression.py
View file @7196427
+++ b/regression.py
View file @7196427
@@ -11,7 +11,7 @@
 #   - Use a specialised database (SILVA) : better alignments (we guess?), but two kinds of jobs
 #   - Use cmalign --small everywhere (homogeneity)
 # Moreover, --small requires --nonbanded --cyk, which means the output alignment is the optimally scored one.
-# To date, we trust Infernal as the best tool to realign RNA. Is it ?
+# To date, we trust Infernal as the best tool to realign ncRNA. Is it?
 # Contact: louis.becquey@univ-evry.fr (PhD student), fariza.tahi@univ-evry.fr (PI)
@@ -28,7 +28,7 @@ pd.set_option('display.max_rows', None)
 LSU_set = ["RF00002", "RF02540", "RF02541", "RF02543", "RF02546"]   # From Rfam CLAN 00112
 SSU_set = ["RF00177", "RF02542",  "RF02545", "RF01959", "RF01960"]  # From Rfam CLAN 00111
-with sqlite3.connect("results/RNANet.db") as conn:
+with sqlite3.connect(os.getcwd()+"/results/RNANet.db") as conn:
     df = pd.read_sql("SELECT rfam_acc, max_len, nb_total_homol, comput_time, comput_peak_mem FROM family;", conn)
 to_remove = [ f for f in df.rfam_acc if f in LSU_set+SSU_set ]
@@ -74,7 +74,7 @@ ax.set_ylabel("Maximum length of sequences ")
 ax.set_zlabel("Computation time (s)")
 plt.subplots_adjust(wspace=0.4)
-plt.savefig("results/cmalign_jobs_performance.png")
+plt.savefig(os.getcwd()+"/results/cmalign_jobs_performance.png")
 # # ========================================================
 # # Linear Regression of max_mem as function of max_length
--- a/statistics.py
View file @7196427
+++ b/statistics.py
View file @7196427