Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Louis BECQUEY
/
RNANetLegacy
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Network
Create a new issue
Builds
Commits
Authored by
Louis BECQUEY
2020-02-06 17:51:55 +0100
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
48c4d3ee4c1fa0b4965de12c0eaa968d184dd4bb
48c4d3ee
1 parent
5f7f3e62
Correct alignment of mappings
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
66 additions
and
31 deletions
RNAnet.py
RNAnet.py
View file @
48c4d3e
...
...
@@ -723,7 +723,11 @@ def cm_realign(rfam_acc, chains, label):
f
.
write
(
">"
+
record
.
description
+
'
\n
'
+
str
(
record
.
seq
)
+
'
\n
'
)
ids
.
append
(
record
.
id
)
for
c
in
chains
:
f
.
write
(
f
"> {str(c)}
\n
"
+
c
.
seq
.
replace
(
'U'
,
'T'
)
.
replace
(
'-'
,
''
)
+
'
\n
'
)
# We align as DNA
seq_str
=
c
.
seq
.
replace
(
'U'
,
'T'
)
.
replace
(
'-'
,
''
)
# We align as DNA
if
rfam_acc
in
[
"RF02541"
,
"RF02543"
]:
seq_str
=
seq_str
.
replace
(
'P'
,
'U'
)
# Replace pseudo-uridines by uridines. We lose information here but SINA does not accept them.
f
.
write
(
f
"> {str(c)}
\n
"
+
seq_str
+
'
\n
'
)
f
.
close
()
if
rfam_acc
not
in
[
"RF02541"
,
"RF02543"
]:
...
...
@@ -817,52 +821,83 @@ def alignment_nt_stats(f):
# Save colums in the appropriate positions
i
=
0
j
=
0
while
i
<
len
(
c
.
seq
)
and
j
<
alilen
:
warn_gaps
=
False
while
i
<
c
.
full_length
and
j
<
alilen
:
# here we try to map c.seq (the sequence of the 3D chain, including gaps when residues are missing),
# with s.seq, the sequence aligned in the MSA, containing any of ACGUacguP and two types of gaps, - and .
if
c
.
seq
[
i
]
==
s
[
j
]
.
upper
():
# alignment and sequence correspond (incl. gaps)
rfam_acc_to_download
[
f
][
idx
]
.
frequencies
=
np
.
concatenate
((
rfam_acc_to_download
[
f
][
idx
]
.
frequencies
,
frequencies
[:,
j
]
.
reshape
(
-
1
,
1
)),
axis
=
1
)
i
+=
1
j
+=
1
elif
s
[
j
]
in
[
'.'
,
'-'
]:
# gap in the alignment, but not in the real chain
j
+=
1
# ignore the column
elif
c
.
seq
[
i
]
==
'-'
:
# gap in the chain, but the sequence aligns well...
warn
(
f
"gap in {c.chain_label} not re-found in the aligned sequence... Ignoring it."
)
elif
c
.
seq
[
i
]
==
'-'
:
# gap in the chain, but not in the aligned sequence
# search for a gap to the consensus nearby
k
=
0
while
j
+
k
<
alilen
and
s
.
seq
[
j
+
k
]
not
in
[
'A'
,
'C'
,
'G'
,
'U'
,
'a'
,
'c'
,
'g'
,
'u'
,
'P'
]:
if
s
.
seq
[
j
+
k
]
==
'-'
:
break
k
+=
1
# if found, set j to that position
if
j
+
k
<
alilen
and
s
.
seq
[
j
+
k
]
==
'-'
:
j
=
j
+
k
continue
# if not, search for a insertion gap nearby
k
=
0
while
j
+
k
<
alilen
and
s
.
seq
[
j
+
k
]
not
in
[
'A'
,
'C'
,
'G'
,
'U'
,
'a'
,
'c'
,
'g'
,
'u'
,
'P'
]:
if
s
.
seq
[
j
+
k
]
==
'.'
:
break
k
+=
1
# if found, set j to that position
if
j
+
k
<
alilen
and
s
.
seq
[
j
+
k
]
==
'.'
:
j
=
j
+
k
rfam_acc_to_download
[
f
][
idx
]
.
frequencies
=
np
.
concatenate
((
rfam_acc_to_download
[
f
][
idx
]
.
frequencies
,
frequencies
[:,
j
]
.
reshape
(
-
1
,
1
)),
axis
=
1
)
i
+=
1
j
+=
1
continue
# else, just ignore the gap.
warn_gaps
=
True
rfam_acc_to_download
[
f
][
idx
]
.
frequencies
=
np
.
concatenate
((
rfam_acc_to_download
[
f
][
idx
]
.
frequencies
,
np
.
array
([
0.0
,
0.0
,
0.0
,
0.0
,
1.0
])
.
reshape
(
-
1
,
1
)),
axis
=
1
)
i
+=
1
elif
s
.
seq
[
j
]
in
[
'.'
,
'-'
]:
# gap in the alignment, but not in the real chain
j
+=
1
# ignore the column
else
:
print
(
"You are never supposed to reach this:"
,
c
.
seq
,
'
\n
'
,
s
,
flush
=
True
)
print
(
"You are never supposed to reach this."
,
c
.
seq
,
'
\n
'
,
s
.
seq
,
sep
=
''
,
flush
=
True
)
if
warn_gaps
:
warn
(
f
"Some gap(s) in {c.chain_label} were not re-found in the aligned sequence... Ignoring them."
)
# Replace masked positions by the consensus sequence:
s
=
c
.
seq
.
split
()
c_seq
=
c
.
seq
.
split
()
letters
=
[
'A'
,
'C'
,
'G'
,
'U'
,
'N'
]
for
i
in
range
(
len
(
s
)
):
for
i
in
range
(
c
.
full_length
):
if
not
c
.
mask
[
i
]:
freq
=
rfam_acc_to_download
[
f
][
idx
]
.
frequencies
[:,
i
]
s
[
i
]
=
letters
[
freq
.
tolist
()
.
index
(
max
(
freq
))]
rfam_acc_to_download
[
f
][
idx
]
.
seq
=
''
.
join
(
s
)
c_seq
[
i
]
=
letters
[
freq
.
tolist
()
.
index
(
max
(
freq
))]
rfam_acc_to_download
[
f
][
idx
]
.
seq
=
''
.
join
(
c_seq
)
# Saving 'final' datapoint
c
=
rfam_acc_to_download
[
f
][
idx
]
# update the local object
point
=
np
.
zeros
((
13
,
c
.
full_length
))
gaps
=
0
for
i
in
range
(
c
.
full_length
):
point
[
0
,
i
]
=
i
+
1
# position
if
c
.
mask
[
i
]:
# the ith nucleotide exists
# one-hot encoding of the actual sequence
point
[
1
,
i
]
=
int
(
c
.
seq
[
i
-
gaps
]
==
'A'
)
point
[
2
,
i
]
=
int
(
c
.
seq
[
i
-
gaps
]
==
'C'
)
point
[
3
,
i
]
=
int
(
c
.
seq
[
i
-
gaps
]
==
'G'
)
point
[
4
,
i
]
=
int
(
c
.
seq
[
i
-
gaps
]
==
'U'
)
point
[
5
,
i
]
=
int
(
c
.
seq
[
i
-
gaps
]
not
in
[
'A'
,
'C'
,
'G'
,
'U'
])
# save the PSSMs
point
[
6
,
i
]
=
c
.
frequencies
[
0
,
i
-
gaps
]
point
[
7
,
i
]
=
c
.
frequencies
[
1
,
i
-
gaps
]
point
[
8
,
i
]
=
c
.
frequencies
[
2
,
i
-
gaps
]
point
[
9
,
i
]
=
c
.
frequencies
[
3
,
i
-
gaps
]
point
[
10
,
i
]
=
c
.
frequencies
[
4
,
i
-
gaps
]
else
:
gaps
+=
1
point
[
5
,
i
]
=
1.0
# one-hot encoding of the actual sequence
point
[
1
,
i
]
=
int
(
c
.
seq
[
i
]
==
'A'
)
point
[
2
,
i
]
=
int
(
c
.
seq
[
i
]
==
'C'
)
point
[
3
,
i
]
=
int
(
c
.
seq
[
i
]
==
'G'
)
point
[
4
,
i
]
=
int
(
c
.
seq
[
i
]
==
'U'
)
point
[
5
,
i
]
=
int
(
c
.
seq
[
i
]
not
in
[
'A'
,
'C'
,
'G'
,
'U'
])
# save the PSSMs
point
[
6
,
i
]
=
c
.
frequencies
[
0
,
i
]
point
[
7
,
i
]
=
c
.
frequencies
[
1
,
i
]
point
[
8
,
i
]
=
c
.
frequencies
[
2
,
i
]
point
[
9
,
i
]
=
c
.
frequencies
[
3
,
i
]
point
[
10
,
i
]
=
c
.
frequencies
[
4
,
i
]
point
[
11
,
i
]
=
c
.
etas
[
i
]
point
[
12
,
i
]
=
c
.
thetas
[
i
]
file
=
open
(
path_to_3D_data
+
"datapoints/"
+
c
.
chain_label
,
"w"
)
...
...
Please
register
or
login
to post a comment