1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | #include <sstream> |
19 | #include "hddm_mc_s.hpp" |
20 | |
21 | #ifndef _FILE_OFFSET_BITS64 |
22 | # define _FILE_OFFSET_BITS64 64 |
23 | #endif |
24 | |
25 | using namespace hddm_mc_s; |
26 | |
27 | static int tags_match(const std::string &a, const std::string &b) |
28 | { |
29 | if (a == b) { |
30 | return true; |
31 | } |
32 | else { |
33 | int len = a.length(); |
34 | int ia=0; |
35 | int ib=0; |
36 | for (; a[ia] == b[ib]; ++ia, ++ib, --len) {} |
37 | for (; a[ia] == ' '; ++ia, --len) {} |
38 | for (; a[ia] == '/'; ++ia, --len) {} |
39 | for (; b[ib] == ' '; ++ib) {} |
40 | for (; b[ib] == '/'; ++ib) {} |
41 | return (a.substr(ia) == b.substr(ib)); |
42 | } |
43 | } |
44 | |
45 | |
46 | streamposition::streamposition() |
47 | : block_start(), block_offset(), block_status() {} |
48 | |
49 | streamposition::streamposition(uint64_t start, uint32_t offset, uint32_t status) |
50 | : block_start(start), block_offset(offset), block_status(status) {} |
51 | |
52 | istream::istream(std::istream &src) |
53 | : m_xstr(0), |
54 | m_istr(src), |
55 | m_xcmp(0), |
56 | m_xraw(0), |
57 | m_status_bits(0) |
58 | { |
59 | char hdr[10]; |
60 | src.getline(hdr,7); |
61 | m_documentString = hdr; |
62 | if (m_documentString != "<HDDM ") { |
63 | throw std::runtime_error("hddm_mc_s::istream::istream error - invalid hddm header"); |
64 | } |
65 | src.clear(); |
66 | std::string line; |
67 | while (std::getline(src,line).good()) { |
68 | m_documentString += line + "\n"; |
69 | if (line == "</HDDM>") { |
70 | break; |
71 | } |
72 | } |
73 | if (src.bad()) { |
74 | throw std::runtime_error("hddm_mc_s::istream::istream error - invalid hddm header"); |
75 | } |
76 | m_genome.m_tagname = "HDDM"; |
77 | m_genome.m_sequence = synthesize(m_documentString,0,HDDM::DocumentString(),0); |
78 | m_event_buffer = new char[m_event_buffer_size = 100000]; |
79 | m_sbuf = new istreambuffer(m_event_buffer,m_event_buffer_size); |
80 | configure_streambufs(); |
81 | m_next_event_size = 0; |
82 | m_events_to_skip = 0; |
83 | m_records_read = 0; |
84 | m_bytes_read = 0; |
85 | } |
86 | |
87 | istream::~istream() { |
88 | if (m_xraw) { |
89 | m_istr.rdbuf(m_xraw); |
90 | } |
91 | if (m_xcmp) { |
92 | delete m_xcmp; |
93 | } |
94 | if (m_xstr) { |
95 | delete m_xstr; |
96 | } |
97 | if (m_sbuf) { |
98 | delete m_sbuf; |
99 | } |
100 | delete [] m_event_buffer; |
101 | } |
102 | |
103 | streamposition istream::getPosition() const { |
104 | streamposition pos; |
105 | pos.block_status = m_status_bits; |
106 | if (m_status_bits & (k_bz2_compression | k_z_compression)) { |
107 | if (m_status_bits & k_can_reposition) { |
108 | pos.block_start = m_xraw->pubseekoff(0,std::ios_base::cur, |
109 | std::ios_base::out); |
110 | if (m_status_bits & k_bz2_compression) { |
111 | pos.block_offset = dynamic_cast<xstream::bz::istreambuf*>(m_xcmp) |
112 | ->get_block_offset(); |
113 | pos.block_start -= dynamic_cast<xstream::bz::istreambuf*>(m_xcmp) |
114 | ->get_block_buffered(); |
115 | } |
116 | else { |
117 | pos.block_offset = dynamic_cast<xstream::z::istreambuf*>(m_xcmp) |
118 | ->get_block_offset(); |
119 | pos.block_start -= dynamic_cast<xstream::z::istreambuf*>(m_xcmp) |
120 | ->get_block_buffered(); |
121 | } |
122 | if (m_next_event_size > 0) |
123 | pos.block_offset -= std::streamoff(4); |
124 | } |
125 | else { |
126 | throw std::runtime_error("hddm_mc_s::istream::getPosition error - " |
127 | "old-format hddm input file does not support repositioning."); |
128 | } |
129 | } |
130 | else { |
131 | pos.block_start = m_istr.tellg(); |
132 | if (m_next_event_size > 0) |
133 | pos.block_start -= std::streamoff(4); |
134 | pos.block_offset = 0; |
135 | } |
136 | return pos; |
137 | } |
138 | |
139 | void istream::setPosition(const streamposition &pos) { |
140 | m_status_bits = pos.block_status; |
141 | if (m_status_bits & (k_bz2_compression | k_z_compression)) { |
| |
142 | if ((m_status_bits & k_can_reposition) == 0) { |
| |
143 | throw std::runtime_error("hddm_mc_s::istream::setPosition error - " |
144 | "old-format hddm input file does not support repositioning."); |
145 | } |
146 | if (m_xraw == 0 || pos.block_start != getPosition().block_start || |
147 | pos.block_offset < getPosition().block_offset) |
148 | { |
149 | m_bytes_read = 0; |
150 | configure_streambufs(); |
| 3 | | Calling 'istream::configure_streambufs' | |
|
| 7 | | Returning from 'istream::configure_streambufs' | |
|
151 | m_xraw->pubseekoff(pos.block_start,std::ios_base::beg, |
| 8 | | Called C++ object pointer is null |
|
152 | std::ios_base::in); |
153 | m_next_event_size = 0; |
154 | } |
155 | int advance; |
156 | while ((advance = pos.block_offset - getPosition().block_offset)) { |
157 | char tmpbuf[advance]; |
158 | m_xcmp->sgetn(tmpbuf, advance); |
159 | } |
160 | } |
161 | else if (pos.block_start != getPosition().block_start) { |
162 | m_istr.seekg(pos.block_start); |
163 | m_next_event_size = 0; |
164 | } |
165 | } |
166 | |
167 | void istream::configure_streambufs() { |
168 | if (m_xstr == 0) { |
| |
169 | m_xstr = new xstream::xdr::istream(m_sbuf); |
170 | } |
171 | if (m_xraw == 0 && (m_status_bits & k_z_compression) != 0) { |
172 | |
173 | m_xraw = m_istr.rdbuf(); |
174 | m_xcmp = new xstream::z::istreambuf(m_xraw); |
175 | m_istr.rdbuf(m_xcmp); |
176 | } |
177 | else if (m_xraw == 0 && (m_status_bits & k_bz2_compression) != 0) { |
178 | |
179 | m_xraw = m_istr.rdbuf(); |
180 | m_xcmp = new xstream::bz::istreambuf(m_xraw); |
181 | m_istr.rdbuf(m_xcmp); |
182 | } |
183 | else if (m_xraw == 0 && (m_status_bits & k_bits_compression) != 0) { |
184 | throw std::runtime_error("hddm_mc_s::istream::configure_streambufs error - " |
185 | "unrecognized compression flag requested."); |
186 | } |
187 | else if (m_xraw != 0) { |
| |
188 | m_istr.rdbuf(m_xraw); |
189 | delete m_xcmp; |
190 | m_xcmp = 0; |
191 | m_xraw = 0; |
| 6 | | Null pointer value stored to field 'm_xraw' | |
|
192 | configure_streambufs(); |
193 | } |
194 | } |
195 | |
196 | istream &istream::operator>>(HDDM &record) { |
197 | if (m_next_event_size == 0) { |
198 | m_istr.read(m_event_buffer,4); |
199 | m_bytes_read += m_istr.gcount(); |
200 | if (!m_istr.good()) { |
201 | throw std::runtime_error("hddm_mc_s::istream::operator>> error - " |
202 | "attempt to read past end of file!"); |
203 | } |
204 | m_sbuf->reset(); |
205 | *m_xstr >> m_next_event_size; |
206 | return *this >> record; |
207 | } |
208 | else if (m_next_event_size == 1) { |
209 | m_istr.read(m_event_buffer+4,4); |
210 | m_bytes_read += m_istr.gcount(); |
211 | if (!m_istr.good()) { |
212 | throw std::runtime_error("hddm_mc_s::istream::operator>> error -" |
213 | " read error on token input!"); |
214 | } |
215 | int size; |
216 | *m_xstr >> size; |
217 | m_istr.read(m_event_buffer+8,size); |
218 | m_bytes_read += m_istr.gcount(); |
219 | if (!m_istr.good()) { |
220 | throw std::runtime_error("hddm_mc_s::istream::operator>> error -" |
221 | " read error on token input!"); |
222 | } |
223 | int format, flags; |
224 | *m_xstr >> format >> flags; |
225 | if (format != 0) { |
226 | throw std::runtime_error("hddm_mc_s::istream::operator>> error - " |
227 | "unsupported compression format!"); |
228 | } |
229 | else if (flags != m_status_bits) { |
230 | int oldcmp = m_status_bits & k_bits_compression; |
231 | int newcmp = flags & k_bits_compression; |
232 | m_status_bits = flags; |
233 | if (oldcmp != newcmp) { |
234 | configure_streambufs(); |
235 | } |
236 | } |
237 | m_next_event_size = 0; |
238 | return *this >> record; |
239 | } |
240 | else if (m_next_event_size+8 > m_event_buffer_size) { |
241 | delete m_xstr; |
242 | delete m_sbuf; |
243 | char *newbuf = new char[m_event_buffer_size = m_next_event_size+1000]; |
244 | m_sbuf = new istreambuffer(newbuf, m_event_buffer_size); |
245 | m_xstr = new xstream::xdr::istream(m_sbuf); |
246 | memcpy(newbuf,m_event_buffer,4); |
247 | delete [] m_event_buffer; |
248 | m_event_buffer = newbuf; |
249 | } |
250 | |
251 | m_istr.read(m_event_buffer+4,m_next_event_size); |
252 | m_bytes_read += m_istr.gcount(); |
253 | m_records_read++; |
254 | if (!m_istr.good()) { |
255 | throw std::runtime_error("hddm_mc_s::istream::operator>> error -" |
256 | " read error in mid-record!"); |
257 | } |
258 | if ((m_status_bits & k_crc32_integrity) != 0) { |
259 | unsigned int recorded_crc; |
260 | char crcbuf[10]; |
261 | istreambuffer sbuf(crcbuf,10); |
262 | xstream::xdr::istream xstr(&sbuf); |
263 | m_istr.read(crcbuf,4); |
264 | m_bytes_read += m_istr.gcount(); |
265 | xstr >> recorded_crc; |
266 | xstream::digest::crc32 crc; |
267 | std::ostream out(&crc); |
268 | out.write(m_event_buffer,m_next_event_size+4); |
269 | out.flush(); |
270 | if (crc.digest() != recorded_crc) { |
271 | char errmsg[] = |
272 | "WARNING: crc data integrity check failed" |
273 | " on hddm_mc_s input stream!" |
274 | "\nThis may be the result of a bug in the" |
275 | " xstream library if you are analyzing a data" |
276 | " file that was generated by code prior to svn" |
277 | " rev 18530.\nIf this concerns you, regenerate" |
278 | " using a newer build of the sim-recon tools" |
279 | " and it should go away.\n"; |
280 | if ((m_status_bits & 0x02) == 0) { |
281 | std::cerr << errmsg << std::endl; |
282 | m_status_bits |= 0x02; |
283 | } |
284 | |
285 | |
286 | } |
287 | } |
288 | |
289 | if (m_events_to_skip) { |
290 | --m_events_to_skip; |
291 | m_next_event_size = 0; |
292 | return *this >> record; |
293 | } |
294 | m_sbuf->reset(); |
295 | m_sequencing = 0; |
296 | m_codon = &m_genome; |
297 | *this >> (streamable&)record; |
298 | m_istr.read(m_event_buffer,4); |
299 | m_bytes_read += m_istr.gcount(); |
300 | if (m_istr.eof()) { |
301 | m_next_event_size = 0; |
302 | } |
303 | else if (!m_istr.good()) { |
304 | throw std::runtime_error("hddm_mc_s::istream::operator>> error - " |
305 | "read error on event size!"); |
306 | } |
307 | else { |
308 | m_sbuf->reset(); |
309 | *m_xstr >> m_next_event_size; |
310 | } |
311 | return *this; |
312 | } |
313 | |
314 | ostream::ostream(std::ostream &src) |
315 | : m_xstr(0), |
316 | m_ostr(src), |
317 | m_xcmp(0), |
318 | m_xraw(0), |
319 | m_status_bits(k_default_status) |
320 | { |
321 | m_ostr << HDDM::DocumentString(); |
322 | if (!m_ostr.good()) { |
323 | throw std::runtime_error("hddm_mc_s::ostream::ostream(ostream) " |
324 | "error - write error on header output!"); |
325 | } |
326 | m_event_buffer = new char[m_event_buffer_size = 100000]; |
327 | m_sbuf = new ostreambuffer(m_event_buffer,m_event_buffer_size); |
328 | configure_streambufs(); |
329 | m_records_written = 0; |
330 | m_bytes_written = 0; |
331 | } |
332 | |
333 | ostream::~ostream() { |
334 | if (m_xstr) { |
335 | delete m_xstr; |
336 | } |
337 | if (m_xraw) { |
338 | m_ostr.flush(); |
339 | m_ostr.rdbuf(m_xraw); |
340 | } |
341 | if (m_xcmp) { |
342 | delete m_xcmp; |
343 | } |
344 | if (m_sbuf) { |
345 | delete m_sbuf; |
346 | } |
347 | delete [] m_event_buffer; |
348 | } |
349 | |
350 | void ostream::setCompression(int flags) { |
351 | if ((flags ^ m_status_bits) & k_bits_compression) { |
352 | m_status_bits &= ~k_bits_compression; |
353 | m_status_bits |= flags; |
354 | m_status_bits |= k_can_reposition; |
355 | m_sbuf->reset(); |
356 | *m_xstr << 1 << 8 << 0 << m_status_bits; |
357 | m_ostr.write(m_sbuf->getbuf(),m_sbuf->size()); |
358 | if (!m_ostr.good()) { |
359 | throw std::runtime_error("hddm_mc_s::ostream::setCompression" |
360 | " error - write error on token output!"); |
361 | } |
362 | configure_streambufs(); |
363 | } |
364 | } |
365 | |
366 | void ostream::setIntegrityChecks(int flags) { |
367 | if ((flags ^ m_status_bits) & k_bits_integrity) { |
368 | m_status_bits &= ~k_bits_integrity; |
369 | m_status_bits |= flags; |
370 | m_sbuf->reset(); |
371 | *m_xstr << 1 << 8 << 0 << m_status_bits; |
372 | m_ostr.write(m_sbuf->getbuf(),m_sbuf->size()); |
373 | if (!m_ostr.good()) { |
374 | throw std::runtime_error("hddm_mc_s::ostream::setIntegrityChecks error - " |
375 | "write error on token output!"); |
376 | } |
377 | } |
378 | } |
379 | |
380 | streamposition ostream::getPosition() const { |
381 | streamposition pos; |
382 | pos.block_status = m_status_bits; |
383 | if (m_status_bits & k_bz2_compression) { |
384 | pos.block_start = m_xraw->pubseekoff(0,std::ios_base::cur, |
385 | std::ios_base::out); |
386 | pos.block_offset = ((xstream::bz::istreambuf*)m_xcmp)->get_block_offset(); |
387 | } |
388 | else if (m_status_bits & k_z_compression) { |
389 | pos.block_start = m_xraw->pubseekoff(0,std::ios_base::cur, |
390 | std::ios_base::out); |
391 | pos.block_offset = ((xstream::z::istreambuf*)m_xcmp)->get_block_offset(); |
392 | } |
393 | else { |
394 | pos.block_start = m_ostr.tellp(); |
395 | pos.block_offset = 0; |
396 | } |
397 | return pos; |
398 | } |
399 | |
400 | void ostream::configure_streambufs() { |
401 | if (m_xstr == 0) { |
402 | m_xstr = new xstream::xdr::ostream(m_sbuf); |
403 | } |
404 | if (m_xraw == 0 && (m_status_bits & k_z_compression) != 0) { |
405 | |
406 | m_xraw = m_ostr.rdbuf(); |
407 | m_xcmp = new xstream::z::ostreambuf(m_xraw); |
408 | m_ostr.rdbuf(m_xcmp); |
409 | } |
410 | else if (m_xraw == 0 && (m_status_bits & k_bz2_compression) != 0) { |
411 | |
412 | m_xraw = m_ostr.rdbuf(); |
413 | m_xcmp = new xstream::bz::ostreambuf(m_xraw); |
414 | m_ostr.rdbuf(m_xcmp); |
415 | } |
416 | else if (m_xraw == 0 && (m_status_bits & k_bits_compression) != 0) { |
417 | throw std::runtime_error("hddm_mc_s::ostream::configure_streambufs error - " |
418 | "unrecognized compression flag requested."); |
419 | } |
420 | else if (m_xraw != 0) { |
421 | m_ostr.rdbuf(m_xraw); |
422 | delete m_xcmp; |
423 | m_xcmp = 0; |
424 | m_xraw = 0; |
425 | configure_streambufs(); |
426 | } |
427 | } |
428 | |
429 | int istream::getTag(const std::string &src, int start, |
430 | std::string &tag, int &level) |
431 | { |
432 | tag = ""; |
433 | size_t p_btag = src.find("<",start); |
434 | size_t p_bline = src.find_last_of("\n",p_btag); |
435 | if (p_bline == std::string::npos) |
436 | { |
437 | p_bline = 0; |
438 | } |
439 | else |
440 | { |
441 | ++p_bline; |
442 | } |
443 | level = (p_btag-p_bline)/2; |
444 | size_t p_etag = p_btag; |
445 | for (size_t quotes=0; p_etag < src.size(); ++p_etag) { |
446 | if (src[p_etag] == '"') { |
447 | tag += "\""; |
448 | ++quotes; |
449 | } |
450 | else if (quotes/2*2 != quotes) { |
451 | tag += src[p_etag]; |
452 | } |
453 | else if (src.find_first_of(" \t\n",p_etag) == 0) { |
454 | tag += " "; |
455 | p_etag = src.find_first_not_of(" \t\n",p_etag)-1; |
456 | } |
457 | else if (src[p_etag] == '>') { |
458 | tag += ">"; |
459 | break; |
460 | } |
461 | else { |
462 | tag += src[p_etag]; |
463 | } |
464 | } |
465 | if (p_etag == src.size()) { |
466 | std::stringstream sstr; |
467 | sstr << "hddm_mc_s::istream::getTag" |
468 | << " error - bad header format" << std::endl |
469 | << " tag " << tag << " at position " << start |
470 | << std::endl; |
471 | throw std::runtime_error(sstr.str()); |
472 | } |
473 | return p_etag+2; |
474 | } |
475 | |
476 | int istream::getEndTag(const std::string &src, int start, |
477 | const std::string &tag) |
478 | { |
479 | if (tag.rfind("/>") == tag.size()-2) { |
480 | return src.find(tag,start) + tag.size()+1; |
481 | } |
482 | else { |
483 | std::string etag = "</"; |
484 | etag += tag.substr(1,tag.find_first_of(' ')-1) + ">"; |
485 | size_t p_etag = src.find(etag,start); |
486 | size_t p_quote = src.find_first_of('"',start); |
487 | while (p_quote != std::string::npos && p_quote < p_etag) { |
488 | p_quote = src.find_first_of('"',p_quote+1); |
489 | if (p_quote > p_etag) { |
490 | p_etag = src.find(etag,p_quote+1); |
491 | } |
492 | p_quote = src.find_first_of('"',p_quote+1); |
493 | } |
494 | if (p_etag == std::string::npos) { |
495 | std::stringstream sstr; |
496 | sstr << "hddm_mc_s::istream::getEndTag" |
497 | << " error - bad header format" << std::endl |
498 | << " tag " << tag << " at position " << start |
499 | << std::endl |
500 | << " end tag " << etag << " not found." |
501 | << std::endl; |
502 | throw std::runtime_error(sstr.str()); |
503 | } |
504 | return p_etag + etag.size()+1; |
505 | } |
506 | } |
507 | |
508 | void istream::collide(const std::string &itag, const std::string &rtag) { |
509 | std::string itagname = itag.substr(1,itag.find(" ")-1); |
510 | std::string rtagname = rtag.substr(1,rtag.find(" ")-1); |
511 | std::string errmsg = "hddm_mc_s::istream::collide warning:\n" |
512 | "tag " + itagname + " in input file " |
513 | "does not match c++ header hddm_mc_s.hpp\n" |
514 | " input file: " + itag + "\n" |
515 | " c++ header: " + rtag + "\n" |
516 | " === Tag " + itagname + " will be ignored," |
517 | " rebuild to cure the problem ==="; |
518 | if (itagname != "HDDM") { |
519 | std::cerr << errmsg << std::endl; |
520 | } |
521 | else { |
522 | throw std::runtime_error(errmsg); |
523 | } |
524 | } |
525 | |
526 | chromosome istream::synthesize(const std::string &src, int p_src, |
527 | const std::string &ref, int p_ref) |
528 | { |
529 | chromosome chrom; |
530 | int slevel, rlevel; |
531 | std::string stag, rtag; |
532 | p_src = getTag(src,p_src,stag,slevel); |
533 | p_ref = getTag(ref,p_ref,rtag,rlevel); |
534 | std::string stagname = stag.substr(1,stag.find(" ")-1); |
535 | std::string rtagname = rtag.substr(1,rtag.find(" ")-1); |
536 | if (stagname != rtagname) { |
537 | throw std::runtime_error("hddm_mc_s::istream::synthesize error - matching algorithm error #2"); |
538 | } |
539 | else if (!tags_match(stag,rtag)) { |
540 | collide(stag,rtag); |
541 | return chrom; |
542 | } |
543 | |
544 | int p2_src, p2_ref; |
545 | int s2level, r2level; |
546 | std::string s2tag, r2tag; |
547 | getTag(src,p2_src=p_src,s2tag,s2level); |
548 | while (s2level > slevel) { |
549 | codon *gene = new codon(); |
550 | std::string s2tagname = s2tag.substr(1,s2tag.find(" ")-1); |
551 | getTag(ref,p2_ref=p_ref,r2tag,r2level); |
552 | int order_of_this_tag_in_ref = 1; |
553 | while (r2level == s2level) { |
554 | std::string r2tagname = r2tag.substr(1,r2tag.find(" ")-1); |
555 | if (s2tagname == r2tagname) { |
556 | if (!tags_match(s2tag,r2tag)) { |
557 | collide(s2tag,r2tag); |
558 | break; |
559 | } |
560 | else { |
561 | gene->m_order = order_of_this_tag_in_ref; |
562 | } |
563 | gene->m_sequence = synthesize(src,p2_src,ref,p2_ref); |
564 | break; |
565 | } |
566 | p2_ref = getEndTag(ref,p2_ref,r2tag); |
567 | getTag(ref,p2_ref,r2tag,r2level); |
568 | ++order_of_this_tag_in_ref; |
569 | } |
570 | gene->m_tagname = s2tagname; |
571 | chrom.push_back(*gene); |
572 | delete gene; |
573 | p2_src = getEndTag(src,p2_src,s2tag); |
574 | getTag(src,p2_src,s2tag,s2level); |
575 | } |
576 | return chrom; |
577 | } |