Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Louis BECQUEY
/
biorseo
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Network
Create a new issue
Builds
Commits
Authored by
Louis BECQUEY
2019-02-26 11:08:26 +0100
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
e6293064bb8c94c1e49f5844fede762923dc2cd8
e6293064
1 parent
d7ca4934
pattern matching for desc motifs
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
79 additions
and
35 deletions
Makefile
cppsrc/Motif.cpp
cppsrc/biominserter.cpp
Makefile
View file @
e629306
...
...
@@ -9,7 +9,7 @@ TARGET = biominserter
CC
=
clang++
# compiling flags here
CFLAGS
=
-Icppsrc/ -I/usr/local/include -I
$(ICONCERT)
-I
$(ICPLEX)
-I
$(INUPACK)
-I
$(IEIGEN)
-
O3
CFLAGS
=
-Icppsrc/ -I/usr/local/include -I
$(ICONCERT)
-I
$(ICPLEX)
-I
$(INUPACK)
-I
$(IEIGEN)
-
g
CXXFLAGS
=
--std
=
c++17 -Wall -Wpedantic -Wextra -Wno-ignored-attributes -Wno-unused-variable
LINKER
=
clang++
...
...
@@ -36,6 +36,12 @@ $(OBJECTS): $(OBJDIR)/%.o : $(SRCDIR)/%.cpp $(INCLUDES)
$(CC)
-c
$(CFLAGS)
$(CXXFLAGS)
$<
-o
$@
@
echo
"
\0
33[00;32mCompiled "
$<
".
\0
33[00m"
.PHONY
:
all
all
:
$(BINDIR)/$(TARGET)
.PHONY
:
re
re
:
remove clean all
.PHONY
:
clean
clean
:
$(rm)
$(OBJECTS)
...
...
cppsrc/Motif.cpp
View file @
e629306
...
...
@@ -39,19 +39,26 @@ vector<Motif> Motif::build_from_desc(const string& descfile, string rna)
vector
<
string
>
bases
;
int
last
;
vector
<
Motif
>
results
;
char
c
=
'a'
;
char
*
prev
=
&
c
;
motif
=
std
::
ifstream
(
descfile
);
std
::
getline
(
motif
,
line
);
// ignore "id: number"
std
::
getline
(
motif
,
line
);
// Bases: 866_G 867_G 868_G 869_G 870_U 871_A ...
split
(
bases
,
line
,
boost
::
is_any_of
(
" "
));
// get a vector of 866_G, 867_G, etc...
seq
=
bases
[
1
].
back
();
std
::
getline
(
motif
,
line
);
// ignore "id: number"
std
::
getline
(
motif
,
line
);
// Bases: 866_G 867_G 868_G 869_G 870_U 871_A ...
boost
::
split
(
bases
,
line
,
[
prev
](
char
c
)
{
bool
res
=
(
*
prev
==
' '
or
*
prev
==
':'
);
*
prev
=
c
;
return
(
c
==
' '
and
res
);
});
// get a vector of 866_G, 867_G, etc...
seq
=
bases
[
1
].
substr
(
bases
[
1
].
find
(
'_'
)
+
1
,
1
);
last
=
std
::
stoi
(
bases
[
1
].
substr
(
0
,
bases
[
1
].
find
(
'_'
)));
for
(
vector
<
string
>::
iterator
b
=
bases
.
begin
()
+
2
;
b
!=
bases
.
end
()
;
b
++
)
{
char
nt
=
b
->
back
();
for
(
vector
<
string
>::
iterator
b
=
bases
.
begin
()
+
1
;
b
!=
bases
.
end
()
-
1
;
b
++
)
{
char
nt
=
b
->
substr
(
b
->
find
(
'_'
)
+
1
,
1
).
back
();
int
pos
=
std
::
stoi
(
b
->
substr
(
0
,
b
->
find
(
'_'
)));
if
(
pos
-
last
>
5
)
{
// finish this component and start a new one
seq
+=
nt
;
component_sequences
.
push_back
(
seq
);
seq
=
""
;
}
else
if
(
pos
-
last
==
2
)
{
...
...
@@ -63,7 +70,7 @@ vector<Motif> Motif::build_from_desc(const string& descfile, string rna)
}
else
if
(
pos
-
last
==
5
)
{
seq
+=
"...."
;
}
seq
+=
nt
;
if
(
pos
-
last
<=
5
)
seq
+=
nt
;
}
// Now component_sequences is a vector of sequences like {AGCGC, CGU..GUUU}
...
...
@@ -74,6 +81,7 @@ vector<Motif> Motif::build_from_desc(const string& descfile, string rna)
for
(
vector
<
Component
>&
v
:
vresults
)
{
results
.
push_back
(
Motif
(
v
,
descfile
.
substr
(
0
,
descfile
.
find
(
".desc"
))));
}
std
::
cout
<<
"
\t
>returning vector of size "
<<
results
.
size
()
<<
std
::
endl
;
return
results
;
}
...
...
@@ -115,24 +123,47 @@ vector<vector<Component>> Motif::find_next_ones_in(string rna, vector<string> vc
pair
<
uint
,
uint
>
pos
;
vector
<
vector
<
Component
>>
results
;
vector
<
vector
<
Component
>>
next_ones
;
vector
<
string
>
next_seqs
(
&
vc
[
1
],
&
vc
[
vc
.
size
()
-
1
]);
vector
<
string
>
next_seqs
;
if
(
vc
.
size
()
>
1
)
{
if
(
vc
.
size
()
>
2
)
next_seqs
=
vector
<
string
>
(
&
vc
[
1
],
&
vc
[
vc
.
size
()
-
1
]);
else
next_seqs
=
vector
<
string
>
(
1
,
vc
.
back
());
std
::
regex_search
(
rna
,
matches
,
c
);
std
::
regex_search
(
rna
,
matches
,
c
);
for
(
uint
i
=
0
;
i
<
matches
.
size
();
++
i
)
// Pour chacun des matches
{
pos
.
first
=
matches
.
position
(
i
);
pos
.
second
=
matches
.
length
(
i
)
+
pos
.
first
-
1
;
std
::
cout
<<
"
\t\t
>We can insert "
<<
vc
[
0
]
<<
" in ["
<<
pos
.
first
<<
','
<<
pos
.
second
<<
']'
<<
std
::
endl
;
std
::
cout
<<
"
\t\t
>Now searching in "
<<
rna
.
substr
(
pos
.
second
+
1
)
<<
std
::
endl
;
next_ones
=
find_next_ones_in
(
rna
.
substr
(
pos
.
second
+
1
),
next_seqs
);
for
(
vector
<
Component
>
v
:
next_ones
)
// Pour chacune des combinaisons suivantes
{
// Combiner le match et la combinaison suivante
vector
<
Component
>
r
;
r
.
push_back
(
Component
(
pos
));
for
(
Component
&
c
:
v
)
r
.
push_back
(
c
);
results
.
push_back
(
r
);
}
}
}
else
{
std
::
regex_search
(
rna
,
matches
,
c
);
for
(
uint
i
=
0
;
i
<
matches
.
size
();
++
i
)
// Pour chacun des matches
{
pos
.
first
=
matches
.
position
(
i
);
pos
.
second
=
matches
.
length
(
i
)
+
pos
.
first
-
1
;
next_ones
=
find_next_ones_in
(
rna
.
substr
(
pos
.
second
+
1
),
next_seqs
);
for
(
vector
<
Component
>
v
:
next_ones
)
// Pour chacune des combinaisons suivantes
for
(
uint
i
=
0
;
i
<
matches
.
size
();
++
i
)
// Pour chacun des matches
{
pos
.
first
=
matches
.
position
(
i
);
pos
.
second
=
matches
.
length
(
i
)
+
pos
.
first
-
1
;
std
::
cout
<<
"
\t\t
>We can insert "
<<
vc
[
0
]
<<
" in ["
<<
pos
.
first
<<
','
<<
pos
.
second
<<
']'
<<
std
::
endl
;
// Combiner le match et la combinaison suivante
vector
<
Component
>
r
;
r
.
push_back
(
Component
(
pos
));
for
(
Component
&
c
:
v
)
r
.
push_back
(
c
);
results
.
push_back
(
r
);
}
}
std
::
cout
<<
"
\t
> returning vector of size "
<<
results
.
size
()
<<
std
::
endl
;
return
results
;
}
...
...
@@ -143,6 +174,8 @@ vector<Motif> load_desc_folder(const string& path, const string& rna, bool verbo
if
(
!
exists
(
path
))
{
std
::
cerr
<<
"Hmh, i can't find that folder: "
<<
path
<<
std
::
endl
;
return
posInsertionSites
;
}
else
{
if
(
verbose
)
std
::
cout
<<
"loading DESC motifs from "
<<
path
<<
"..."
<<
std
::
endl
;
}
for
(
auto
it
:
recursive_directory_range
(
path
))
{
...
...
@@ -176,16 +209,22 @@ bool is_desc_insertible(const string& descfile, const string& rna, bool verbose)
string
seq
;
vector
<
string
>
bases
;
int
last
;
char
c
=
'a'
;
char
*
prev
=
&
c
;
motif
=
std
::
ifstream
(
descfile
);
std
::
getline
(
motif
,
line
);
// ignore "id: number"
std
::
getline
(
motif
,
line
);
// Bases: 866_G 867_G 868_G 869_G 870_U 871_A ...
split
(
bases
,
line
,
boost
::
is_any_of
(
" "
));
// get a vector of 866_G, 867_G, etc...
seq
=
bases
[
1
].
back
();
std
::
getline
(
motif
,
line
);
// ignore "id: number"
std
::
getline
(
motif
,
line
);
// Bases: 866_G 867_G 868_G 869_G 870_U 871_A ...
boost
::
split
(
bases
,
line
,
[
prev
](
char
c
)
{
bool
res
=
(
*
prev
==
' '
or
*
prev
==
':'
);
*
prev
=
c
;
return
(
c
==
' '
and
res
);
});
// get a vector of 866_G, 867_G, etc...
seq
=
""
;
last
=
std
::
stoi
(
bases
[
1
].
substr
(
0
,
bases
[
1
].
find
(
'_'
)));
for
(
vector
<
string
>::
iterator
b
=
bases
.
begin
()
+
2
;
b
!=
bases
.
end
(
);
b
++
)
{
char
nt
=
b
->
back
();
for
(
vector
<
string
>::
iterator
b
=
(
bases
.
begin
()
+
1
);
b
!=
(
bases
.
end
()
-
1
);
b
++
)
{
char
nt
=
b
->
substr
(
b
->
find
(
'_'
)
+
1
,
1
).
back
();
int
pos
=
std
::
stoi
(
b
->
substr
(
0
,
b
->
find
(
'_'
)));
if
(
pos
-
last
>
5
)
{
// finish this component and start a new one
...
...
@@ -206,16 +245,15 @@ bool is_desc_insertible(const string& descfile, const string& rna, bool verbose)
std
::
regex
e
(
seq
);
if
(
std
::
regex_search
(
rna
,
m
,
e
))
{
if
(
verbose
)
std
::
cout
<<
"Motif "
<<
descfile
.
substr
(
0
,
descfile
.
find
(
".desc"
))
<<
" "
<<
seq
<<
" can be inserted."
<<
std
::
endl
;
std
::
cout
<<
"
\t
>Motif "
<<
descfile
.
substr
(
0
,
descfile
.
find
(
".desc"
))
<<
"
\t
"
<<
seq
<<
" can be inserted."
<<
std
::
endl
;
return
true
;
}
else
{
if
(
verbose
)
std
::
cout
<<
"Ignoring motif "
<<
descfile
.
substr
(
0
,
descfile
.
find
(
".desc"
))
<<
" "
<<
seq
<<
std
::
endl
;
// if (verbose) std::cout << "Ignoring motif " << descfile.substr(0, descfile.find(".desc")) << " \t" << seq << std::endl;
return
false
;
}
}
bool
operator
==
(
const
Component
&
c1
,
const
Component
&
c2
)
{
if
(
c1
.
pos
.
first
!=
c2
.
pos
.
first
)
return
false
;
...
...
cppsrc/biominserter.cpp
View file @
e629306
...
...
@@ -56,14 +56,14 @@ int main(int argc, char* argv[])
if
(
argc
!=
6
)
{
cerr
<<
argc
<<
" arguments specified !"
<<
endl
;
cerr
<<
"Please specify the following input files:"
<<
endl
;
cerr
<<
"biominserter sequence.fasta
insertion.sites.csv
prob_threshold verbose obj"
<<
endl
;
cerr
<<
"biominserter sequence.fasta
motifs_file_or_DESC_folder
prob_threshold verbose obj"
<<
endl
;
return
EXIT_FAILURE
;
}
/* VARIABLE DECLARATIONS */
const
char
*
inputName
=
argv
[
1
];
const
char
*
csvname
=
argv
[
2
];
const
char
*
motifs_path_name
=
argv
[
2
];
bool
verbose
=
(
atoi
(
argv
[
4
])
!=
0
);
string
basename
=
remove_ext
(
inputName
,
'.'
,
'/'
);
float
theta_p_threshold
=
atof
(
argv
[
3
]);
...
...
@@ -89,13 +89,13 @@ int main(int argc, char* argv[])
if
(
verbose
)
cout
<<
"
\t
>"
<<
inputName
<<
" successfuly loaded ("
<<
myRNA
.
get_RNA_length
()
<<
" nt)"
<<
endl
;
// load CSV file
if
(
access
(
csv
name
,
F_OK
)
==
-
1
)
{
cerr
<<
csv
name
<<
" not found"
<<
endl
;
if
(
access
(
motifs_path_
name
,
F_OK
)
==
-
1
)
{
cerr
<<
motifs_path_
name
<<
" not found"
<<
endl
;
return
EXIT_FAILURE
;
}
posInsertionSites
=
load_desc_folder
(
csv
name
,
fa
->
seq
(),
verbose
);
posInsertionSites
=
load_desc_folder
(
motifs_path_
name
,
fa
->
seq
(),
verbose
);
if
(
verbose
)
cout
<<
"
\t
>"
<<
csv
name
<<
" successfuly loaded ("
<<
posInsertionSites
.
size
()
<<
" insertion sites)"
<<
endl
;
cout
<<
"
\t
>"
<<
motifs_path_
name
<<
" successfuly loaded ("
<<
posInsertionSites
.
size
()
<<
" insertion sites)"
<<
endl
;
exit
(
0
);
/* FIND PARETO SET */
...
...
Please
register
or
login
to post a comment