Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Louis BECQUEY
/
RNANet
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Aglaé TABOT
2021-06-11 14:49:59 +0200
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
655573b348e0423ff75b2b8578d918fc5e0ceba9
655573b3
1 parent
ca4600da
Modified the function concatenate and added the function dist_atoms_hire_RNA
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
113 additions
and
20 deletions
statistics.py
statistics.py
View file @
655573b
...
...
@@ -1185,11 +1185,6 @@ def distance(coord1, coord2):
"""
Returns the distance between two points using their coordinates (x, y, z)
"""
'''
if (coord1 == [] or coord2 == []) :
return None
else :
'''
return
np
.
sqrt
((
coord1
[
0
]
-
coord2
[
0
])
**
2
+
(
coord1
[
1
]
-
coord2
[
1
])
**
2
+
(
coord1
[
2
]
-
coord2
[
2
])
**
2
)
def
pos_b1
(
res
)
:
...
...
@@ -1278,6 +1273,9 @@ def pos_b2(res):
return
coordb2
def
dist_atoms
(
f
):
'''
Measures the distance between atoms linked by covalent bonds
'''
name
=
str
.
split
(
f
,
'.'
)[
0
]
...
...
@@ -1603,16 +1601,100 @@ def dist_atoms(f):
df
.
to_csv
(
runDir
+
"/results/distances/"
+
'dist_atoms '
+
name
+
'.csv'
)
def
concatenate_dist
():
liste_dist
=
os
.
listdir
(
runDir
+
"/results/distances"
)
df_0
=
pd
.
read_csv
(
os
.
path
.
abspath
(
runDir
+
"/results/distances/"
+
liste_dist
[
0
]))
del
(
liste_dist
[
0
])
def
concatenate
(
chemin
,
liste
,
filename
):
'''
Concatenates the dataframes of liste containing measures
and creates a new dataframe gathering all
'''
liste
=
os
.
listdir
(
runDir
+
chemin
)
df_0
=
pd
.
read_csv
(
os
.
path
.
abspath
(
runDir
+
chemin
+
liste
[
0
]))
del
(
liste
[
0
])
df_tot
=
df_0
for
f
in
liste
_dist
:
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
runDir
+
"/results/distances/"
+
f
))
for
f
in
liste
:
df
=
pd
.
read_csv
(
os
.
path
.
abspath
(
runDir
+
chemin
+
f
))
df_tot
=
pd
.
concat
([
df_tot
,
df
],
ignore_index
=
True
)
df_tot
.
to_csv
(
runDir
+
'/results/distances/'
+
'dist_atomes.csv'
)
df_tot
.
to_csv
(
runDir
+
chemin
+
filename
)
def
dist_atoms_hire_RNA
(
f
)
:
'''
Measures the distance between the atoms of the HiRE-RNA model linked by covalent bonds
'''
name
=
str
.
split
(
f
,
'.'
)[
0
]
liste_dist
=
[]
last_c4p
=
[]
parser
=
MMCIFParser
()
s
=
parser
.
get_structure
(
name
,
os
.
path
.
abspath
(
"/home/data/RNA/3D/rna_only/"
+
f
))
chain
=
next
(
s
[
0
]
.
get_chains
())
os
.
makedirs
(
runDir
+
"/results/distances_hRNA/"
,
exist_ok
=
True
)
for
res
in
chain
:
p_o5p
=
None
o5p_c5p
=
None
c5p_c4p
=
None
c4p_c1p
=
None
c1p_b1
=
None
b1_b2
=
None
last_c4p_p
=
np
.
nan
if
res
.
get_resname
()
not
in
[
'ATP'
,
'CCC'
,
'A3P'
,
'A23'
,
'GDP'
,
'RIA'
]
:
#several phosphate groups, ignore
atom_p
=
[
atom
.
get_coord
()
for
atom
in
res
if
atom
.
get_name
()
==
"P"
]
atom_o5p
=
[
atom
.
get_coord
()
for
atom
in
res
if
"O5'"
in
atom
.
get_fullname
()
]
atom_c5p
=
[
atom
.
get_coord
()
for
atom
in
res
if
"C5'"
in
atom
.
get_fullname
()
]
atom_c4p
=
[
atom
.
get_coord
()
for
atom
in
res
if
"C4'"
in
atom
.
get_fullname
()
]
atom_c1p
=
[
atom
.
get_coord
()
for
atom
in
res
if
"C1'"
in
atom
.
get_fullname
()
]
atom_b1
=
pos_b1
(
res
)
#position b1 to be calculated, depending on the case
atom_b2
=
pos_b2
(
res
)
#position b2 to be calculated only for those with 2 cycles
if
len
(
last_c4p
)
<
1
or
len
(
atom_p
)
<
1
or
f
!=
f_prec
:
#link with the previous residue in the chain
last_c4p_p
=
last_c4p_p
else
:
if
distance
(
last_c4p
[
0
],
atom_p
[
0
])
>
5
:
last_c4p_p
=
last_c4p_p
else
:
last_c4p_p
=
distance
(
last_c4p
[
0
],
atom_p
[
0
])
if
len
(
atom_p
)
<
1
or
len
(
atom_o5p
)
<
1
:
p_o5p
=
p_o5p
else
:
p_o5p
=
distance
(
atom_p
[
0
],
atom_o5p
[
0
])
if
len
(
atom_c5p
)
<
1
or
len
(
atom_o5p
)
<
1
:
o5p_c5p
=
o5p_c5p
else
:
o5p_c5p
=
distance
(
atom_o5p
[
0
],
atom_c5p
[
0
])
if
len
(
atom_c5p
)
<
1
or
len
(
atom_c4p
)
<
1
:
c5p_c4p
=
c5p_c4p
else
:
c5p_c4p
=
distance
(
atom_c5p
[
0
],
atom_c4p
[
0
])
if
len
(
atom_c4p
)
<
1
or
len
(
atom_c1p
)
<
1
:
c4p_c1p
=
c4p_c1p
else
:
c4p_c1p
=
distance
(
atom_c4p
[
0
],
atom_c1p
[
0
])
if
len
(
atom_c1p
)
<
1
or
len
(
atom_b1
)
<
1
:
c1p_b1
=
c1p_b1
else
:
c1p_b1
=
distance
(
atom_c1p
[
0
],
atom_b1
)
if
len
(
atom_b1
)
<
1
or
len
(
atom_b2
)
<
1
:
b1_b2
=
b1_b2
else
:
b1_b2
=
distance
(
atom_b1
,
atom_b2
)
last_c4p
=
atom_c4p
f_prec
=
f
liste_dist
.
append
([
res
.
get_resname
(),
last_c4p_p
,
p_o5p
,
o5p_c5p
,
c5p_c4p
,
c4p_c1p
,
c1p_b1
,
b1_b2
])
df
=
pd
.
DataFrame
(
liste_dist
,
columns
=
[
"Residu"
,
"C4'-P"
,
"P-O5'"
,
"O5'-C5'"
,
"C5'-C4'"
,
"C4'-C1'"
,
"C1'-B1"
,
"B1-B2"
])
df
.
to_csv
(
runDir
+
'/results/distances_hRNA/'
+
'dist_atoms_hire_RNA '
+
name
+
'.csv'
)
...
...
@@ -1672,7 +1754,7 @@ if __name__ == "__main__":
# Load mappings. famlist will contain only families with structures at this resolution threshold.
'''
print("Loading mappings list...")
with sqlite3.connect(runDir + "/results/RNANet.db") as conn:
conn.execute('pragma journal_mode=wal')
...
...
@@ -1696,7 +1778,7 @@ if __name__ == "__main__":
print(f"Found {len(famlist)} families with chains of resolution {res_thr}A or better.")
if len(ignored):
print(f"Idty matrices: Ignoring {len(ignored)} families with only one chain:", " ".join(ignored)+'
\n
')
'''
if
DELETE_OLD_DATA
:
for
f
in
famlist
:
subprocess
.
run
([
"rm"
,
"-f"
,
runDir
+
f
"/data/{f}.npy"
,
runDir
+
f
"/data/{f}_pairs.csv"
,
runDir
+
f
"/data/{f}_counts.csv"
])
...
...
@@ -1714,7 +1796,7 @@ if __name__ == "__main__":
# Define the tasks
joblist
=
[]
'''
if n_unmapped_chains and DO_WADLEY_ANALYSIS:
joblist.append(Job(function=reproduce_wadley_results, args=(1, False, (1,4), res_thr)))
joblist.append(Job(function=reproduce_wadley_results, args=(4, False, (1,4), res_thr)))
...
...
@@ -1736,14 +1818,25 @@ if __name__ == "__main__":
joblist.append(Job(function=parallel_stats_pairs, args=(f,))) # updates the database
if f not in ignored:
joblist.append(Job(function=to_id_matrix, args=(f,))) # updates the database
'''
#dist_atoms(os.listdir(path_to_3D_data + "rna_only")[0])
'''
f_prec=os.listdir(path_to_3D_data + "rna_only")[0]
#exit()
for f in os.listdir(path_to_3D_data + "rna_only")[:100]:
joblist.append(Job(function=dist_atoms, args=(f,)))
'''
#dist_atoms_hire_RNA(os.listdir(path_to_3D_data + "rna_only")[0])
#concatenate('/results/distances/', os.listdir(runDir+'/results/distances/'), 'dist_atoms.csv')
#exit()
f_prec
=
os
.
listdir
(
path_to_3D_data
+
"rna_only"
)[
0
]
for
f
in
os
.
listdir
(
path_to_3D_data
+
"rna_only"
)[:
100
]:
joblist
.
append
(
Job
(
function
=
dist_atoms
,
args
=
(
f
,)))
joblist
.
append
(
Job
(
function
=
dist_atoms_hire_RNA
,
args
=
(
f
,)))
p
=
Pool
(
initializer
=
init_worker
,
initargs
=
(
tqdm
.
get_lock
(),),
processes
=
nworkers
)
pbar
=
tqdm
(
total
=
len
(
joblist
),
desc
=
"Stat jobs"
,
position
=
0
,
unit
=
"job"
,
leave
=
True
)
...
...
@@ -1774,12 +1867,12 @@ if __name__ == "__main__":
print
()
print
()
#concatenate_dist()
# finish the work after the parallel portions
# finish the work after the parallel portions
'''
per_chain_stats()
seq_idty()
stats_pairs()
if n_unmapped_chains:
general_stats()
\ No newline at end of file
'''
\ No newline at end of file
...
...
Please
register
or
login
to post a comment