-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcompress.cpp
More file actions
169 lines (129 loc) · 4.25 KB
/
compress.cpp
File metadata and controls
169 lines (129 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#include "compress.h"
using std::cout;
using std::endl;
using std::vector;
using std::pair;
using std::string;
Sequence::Sequence(){
}
Sequence::Sequence(string& inputSequence){
for(uint i(0);i<inputSequence.size();i++){
switch (inputSequence[i]){
case 'A': s.push_back(false); s.push_back(false);break;
case 'C': s.push_back(false); s.push_back(true);break;
case 'G': s.push_back(true); s.push_back(false);break;
default: s.push_back(true); s.push_back(true);break;
}
}
}
Sequence::Sequence(vector<bool> &inputVector){
s = inputVector;
}
string Sequence::str() const{
string str(s.size()/2, 'N');
uint j = 0;
for(uint i(0);i<s.size();i+=2){
if(s[i]){
if(s[i+1]){
str[j] = 'T';
}else{
str[j] = 'G';
}
}else{
if(s[i+1]){
str[j] = 'C';
}else{
str[j] = 'A';
}
}
j++;
}
return str;
}
Sequence Sequence::reverse_complement() const{
std::vector<bool> res;
for (int i = s.size()-1 ; i>0 ; i-=2){
res.push_back(not s[i-1]);
res.push_back(not s[i]);
}
Sequence r (res);
return r;
}
//takes a subset of a sequence, with argument the position on the sequence and the number of nucleotides
Sequence Sequence::subseq(int start, int length){
//this verification might be useful for debugging, but it slows down the program A LOT
// if (start*2+length*2 > seq.size()){
// cout << "ERROR in subseq, the sequence is too short " << seq.size() << " " << start << " " << length << endl;
// }
vector<bool>::const_iterator st = s.begin() + 2*start;
vector<bool>::const_iterator f = s.begin() + 2*length+2*start;
vector<bool> res(st, f);
//auto s = seq.begin();
return Sequence(res);
}
bool operator==(Sequence const &a , Sequence const &b){
return a.s == b.s;
}
size_t Sequence::size(){
return size_t(s.size()/2);
}
//returns true if the range [start1,start1+k) less than [start2, start2+k)
bool Sequence::compare_kmers(int start1, int start2, int k){
return not lexicographical_compare(s.begin()+start1*2, s.begin()+(start1+k)*2,s.begin()+start2*2, s.begin()+(start2+k)*2);
}
//returns in a deterministic fashion at least one read in all windows w, on average 1 / 2^hardness read in total
void Sequence::minimisers(int hardness, int k, int w, vector<vector<int>> &minis){
int num_threads = minis.size();
std::hash<vector<bool>> h;
vector<bool> criterion (hardness, false);
vector <bool> emptyWindows (this->size()-w+1, true);
bool cont;
do{
int lastM = 0;
int lastEmpty = -w; //a variable to keep track of which windows need a minimizer
//in empty windows, look for kmers matching the criterion
for (int i = 0 ; i<=this->size()-k ; i++){
int emptyWindowsSize = emptyWindows.size();
if (i<emptyWindowsSize&& emptyWindows[i]) lastEmpty = i;
if (i-lastEmpty < w){
bool good = true;
short index = 0;
while (good && index < hardness){
good = good && (s[i*2+index]==criterion[index]);
index ++;
}
if (good) { //found a minimiser !
minis[this->subseq(i,k).hash()%num_threads].push_back(i);
for (int j = std::max(lastM, i-w) ; j <= std::min(i, emptyWindowsSize) ; j++){
emptyWindows[j] = false;
}
lastM = i;
}
}
}
//check if there is still an empty window
cont = false;
short index = 0;
while (!cont && index<emptyWindows.size()){
cont = cont || emptyWindows[index];
index ++;
}
// update criterion by simple incrementing
bool extra = true;
int pos = hardness-1;
while (extra && pos>=0){
if (criterion[pos]){
criterion[pos] = false;
}
else{
criterion[pos] = true;
extra = false;
}
pos--;
}
} while (cont);
}
size_t Sequence::hash(){
std::hash<vector<bool>> h;
return h(s);
}