SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <bit>
16#include <iterator>
17#include <ranges>
18#include <string>
19#include <vector>
20
33
34namespace seqan3
35{
36
50{
51public:
55 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
56 format_bam() = default;
57 format_bam(format_bam const &) = default;
58 format_bam & operator=(format_bam const &) = default;
59 format_bam(format_bam &&) = default;
61 ~format_bam() = default;
62
64
67
68protected:
69 template <typename stream_type, // constraints checked by file
70 typename seq_legal_alph_type,
71 typename ref_seqs_type,
72 typename ref_ids_type,
73 typename stream_pos_type,
74 typename seq_type,
75 typename id_type,
76 typename offset_type,
77 typename ref_seq_type,
78 typename ref_id_type,
79 typename ref_offset_type,
80 typename cigar_type,
81 typename flag_type,
82 typename mapq_type,
83 typename qual_type,
84 typename mate_type,
85 typename tag_dict_type,
86 typename e_value_type,
87 typename bit_score_type>
88 void read_alignment_record(stream_type & stream,
89 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
90 ref_seqs_type & ref_seqs,
92 stream_pos_type & position_buffer,
93 seq_type & seq,
94 qual_type & qual,
95 id_type & id,
96 offset_type & offset,
97 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
98 ref_id_type & ref_id,
99 ref_offset_type & ref_offset,
100 cigar_type & cigar_vector,
101 flag_type & flag,
102 mapq_type & mapq,
103 mate_type & mate,
104 tag_dict_type & tag_dict,
105 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
106 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
107
108 template <typename stream_type,
109 typename header_type,
110 typename seq_type,
111 typename id_type,
112 typename ref_seq_type,
113 typename ref_id_type,
114 typename cigar_type,
115 typename qual_type,
116 typename mate_type,
117 typename tag_dict_type>
118 void write_alignment_record([[maybe_unused]] stream_type & stream,
119 [[maybe_unused]] sam_file_output_options const & options,
120 [[maybe_unused]] header_type && header,
121 [[maybe_unused]] seq_type && seq,
122 [[maybe_unused]] qual_type && qual,
123 [[maybe_unused]] id_type && id,
124 [[maybe_unused]] int32_t const offset,
125 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
126 [[maybe_unused]] ref_id_type && ref_id,
127 [[maybe_unused]] std::optional<int32_t> ref_offset,
128 [[maybe_unused]] cigar_type && cigar_vector,
129 [[maybe_unused]] sam_flag const flag,
130 [[maybe_unused]] uint8_t const mapq,
131 [[maybe_unused]] mate_type && mate,
132 [[maybe_unused]] tag_dict_type && tag_dict,
133 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
134 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
135
136private:
138 bool header_was_read{false};
139
142
145 { // naming corresponds to official SAM/BAM specifications
146 int32_t block_size;
147 int32_t refID;
148 int32_t pos;
149 uint32_t l_read_name : 8;
150 uint32_t mapq : 8;
151 uint32_t bin : 16;
152 uint32_t n_cigar_op : 16;
154 int32_t l_seq;
155 int32_t next_refID;
156 int32_t next_pos;
157 int32_t tlen;
158 };
159
160 // clang-format off
163 {
164 []() constexpr {
166
167 using index_t = std::make_unsigned_t<char>;
168
169 // ret['M'] = 0; set anyway by initialization
170 ret[static_cast<index_t>('I')] = 1;
171 ret[static_cast<index_t>('D')] = 2;
172 ret[static_cast<index_t>('N')] = 3;
173 ret[static_cast<index_t>('S')] = 4;
174 ret[static_cast<index_t>('H')] = 5;
175 ret[static_cast<index_t>('P')] = 6;
176 ret[static_cast<index_t>('=')] = 7;
177 ret[static_cast<index_t>('X')] = 8;
178
179 return ret;
180 }()
181 };
182 // clang-format on
183
185 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
186 {
187 --end;
188 if (beg >> 14 == end >> 14)
189 return ((1 << 15) - 1) / 7 + (beg >> 14);
190 if (beg >> 17 == end >> 17)
191 return ((1 << 12) - 1) / 7 + (beg >> 17);
192 if (beg >> 20 == end >> 20)
193 return ((1 << 9) - 1) / 7 + (beg >> 20);
194 if (beg >> 23 == end >> 23)
195 return ((1 << 6) - 1) / 7 + (beg >> 23);
196 if (beg >> 26 == end >> 26)
197 return ((1 << 3) - 1) / 7 + (beg >> 26);
198 return 0;
199 }
200
207 template <typename stream_view_type, std::integral number_type>
208 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
209 {
210 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
211 }
212
218 template <typename stream_view_type>
219 void read_float_byte_field(stream_view_type && stream_view, float & target)
220 {
221 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
222 }
223
224 template <typename stream_view_type, typename value_type>
226 stream_view_type && stream_view,
227 value_type const & SEQAN3_DOXYGEN_ONLY(value));
228
229 template <typename stream_view_type>
230 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
231
232 template <typename cigar_input_type>
233 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
234
235 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
236};
237
239template <typename stream_type, // constraints checked by file
240 typename seq_legal_alph_type,
241 typename ref_seqs_type,
242 typename ref_ids_type,
243 typename stream_pos_type,
244 typename seq_type,
245 typename id_type,
246 typename offset_type,
247 typename ref_seq_type,
248 typename ref_id_type,
249 typename ref_offset_type,
250 typename cigar_type,
251 typename flag_type,
252 typename mapq_type,
253 typename qual_type,
254 typename mate_type,
255 typename tag_dict_type,
256 typename e_value_type,
257 typename bit_score_type>
258inline void
260 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
261 ref_seqs_type & ref_seqs,
263 stream_pos_type & position_buffer,
264 seq_type & seq,
265 qual_type & qual,
266 id_type & id,
267 offset_type & offset,
268 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
269 ref_id_type & ref_id,
270 ref_offset_type & ref_offset,
271 cigar_type & cigar_vector,
272 flag_type & flag,
273 mapq_type & mapq,
274 mate_type & mate,
275 tag_dict_type & tag_dict,
276 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
277 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
278{
279 static_assert(detail::decays_to_ignore_v<ref_offset_type>
280 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
281 "The ref_offset must be a specialisation of std::optional.");
282
283 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
284 "The type of field::mapq must be uint8_t.");
285
286 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
287 "The type of field::flag must be seqan3::sam_flag.");
288
289 auto stream_view = seqan3::detail::istreambuf(stream);
290
291 [[maybe_unused]] int32_t offset_tmp{}; // needed in case the cigar string was stored in the tag dictionary
292 [[maybe_unused]] int32_t ref_length{0}; // needed in case the cigar string was stored in the tag dictionary
293
294 // Header
295 // -------------------------------------------------------------------------------------------------------------
296 if (!header_was_read)
297 {
298 // magic BAM string
300 throw format_error{"File is not in BAM format."};
301
302 int32_t l_text{}; // length of header text including \0 character
303 int32_t n_ref{}; // number of reference sequences
304 int32_t l_name{}; // 1 + length of reference name including \0 character
305 int32_t l_ref{}; // length of reference sequence
306
307 read_integral_byte_field(stream_view, l_text);
308
309 if (l_text > 0) // header text is present
310 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
311
312 read_integral_byte_field(stream_view, n_ref);
313
314 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
315 {
316 read_integral_byte_field(stream_view, l_name);
317
318 string_buffer.resize(l_name - 1);
320 l_name - 1,
321 string_buffer.data()); // copy without \0 character
322 ++std::ranges::begin(stream_view); // skip \0 character
323
324 read_integral_byte_field(stream_view, l_ref);
325
326 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
327 {
328 // If there was no header text, we parse reference sequences block as header information
329 if (l_text == 0)
330 {
331 auto & reference_ids = header.ref_ids();
332 // put the length of the reference sequence into ref_id_info
333 header.ref_id_info.emplace_back(l_ref, "");
334 // put the reference name into reference_ids
335 reference_ids.push_back(string_buffer);
336 // assign the reference name an ascending reference id (starts at index 0).
337 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
338 continue;
339 }
340 }
341
342 auto id_it = header.ref_dict.find(string_buffer);
343
344 // sanity checks of reference information to existing header object:
345 if (id_it == header.ref_dict.end()) // [unlikely]
346 {
347 throw format_error{detail::to_string("Unknown reference name '" + string_buffer
348 + "' found in BAM file header (header.ref_ids():",
349 header.ref_ids(),
350 ").")};
351 }
352 else if (id_it->second != ref_idx) // [unlikely]
353 {
354 throw format_error{detail::to_string("Reference id '",
356 "' at position ",
357 ref_idx,
358 " does not correspond to the position ",
359 id_it->second,
360 " in the header (header.ref_ids():",
361 header.ref_ids(),
362 ").")};
363 }
364 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
365 {
366 throw format_error{"Provided reference has unequal length as specified in the header."};
367 }
368 }
369
370 header_was_read = true;
371
372 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
373 return;
374 }
375
376 // read alignment record into buffer
377 // -------------------------------------------------------------------------------------------------------------
378 position_buffer = stream.tellg();
380 std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
381
382 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
383 {
384 throw format_error{detail::to_string("Reference id index '",
385 core.refID,
386 "' is not in range of ",
387 "header.ref_ids(), which has size ",
388 header.ref_ids().size(),
389 ".")};
390 }
391 else if (core.refID > -1) // not unmapped
392 {
393 ref_id = core.refID; // field::ref_id
394 }
395
396 flag = core.flag; // field::flag
397 mapq = core.mapq; // field::mapq
398
399 if (core.pos > -1) // [[likely]]
400 ref_offset = core.pos; // field::ref_offset
401
402 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
403 {
404 if (core.next_refID > -1)
405 get<0>(mate) = core.next_refID;
406
407 if (core.next_pos > -1) // [[likely]]
408 get<1>(mate) = core.next_pos;
409
410 get<2>(mate) = core.tlen;
411 }
412
413 // read id
414 // -------------------------------------------------------------------------------------------------------------
415 auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
416 if constexpr (!detail::decays_to_ignore_v<id_type>)
417 read_forward_range_field(id_view, id); // field::id
418 else
419 detail::consume(id_view);
420 ++std::ranges::begin(stream_view); // skip '\0'
421
422 // read cigar string
423 // -------------------------------------------------------------------------------------------------------------
424 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
425 {
426 int32_t seq_length{0};
427 std::tie(cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
428 int32_t soft_clipping_end{};
429 transfer_soft_clipping_to(cigar_vector, offset_tmp, soft_clipping_end);
430 }
431 else
432 {
434 }
435
436 offset = offset_tmp;
437
438 // read sequence
439 // -------------------------------------------------------------------------------------------------------------
440 if (core.l_seq > 0) // sequence information is given
441 {
442 auto seq_stream = stream_view | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
445 {
446 return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
447 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
448 });
449
450 if constexpr (detail::decays_to_ignore_v<seq_type>)
451 {
452 auto skip_sequence_bytes = [&]()
453 {
454 detail::consume(seq_stream);
455 if (core.l_seq & 1)
456 ++std::ranges::begin(stream_view);
457 };
458
459 skip_sequence_bytes();
460 }
461 else
462 {
463 using alph_t = std::ranges::range_value_t<decltype(seq)>;
464 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
465
466 for (auto [d1, d2] : seq_stream)
467 {
468 seq.push_back(from_dna16[to_rank(d1)]);
469 seq.push_back(from_dna16[to_rank(d2)]);
470 }
471
472 if (core.l_seq & 1)
473 {
474 dna16sam d =
475 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
476 seq.push_back(from_dna16[to_rank(d)]);
477 ++std::ranges::begin(stream_view);
478 }
479 }
480 }
481
482 // read qual string
483 // -------------------------------------------------------------------------------------------------------------
484 auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
486 [](char chr)
487 {
488 return static_cast<char>(chr + 33);
489 });
490 if constexpr (!detail::decays_to_ignore_v<qual_type>)
491 read_forward_range_field(qual_view, qual);
492 else
493 detail::consume(qual_view);
494
495 // All remaining optional fields if any: SAM tags dictionary
496 // -------------------------------------------------------------------------------------------------------------
497 int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4 /*block_size excluded*/)
498 - core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
499 assert(remaining_bytes >= 0);
500 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
501
502 while (tags_view.size() > 0)
503 {
504 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
505 read_sam_dict_field(tags_view, tag_dict);
506 else
507 detail::consume(tags_view);
508 }
509
510 // DONE READING - wrap up
511 // -------------------------------------------------------------------------------------------------------------
512 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
513 {
514 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
515 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
516 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
517 if (core.l_seq != 0 && offset_tmp == core.l_seq)
518 {
519 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
520 { // maybe only throw in debug mode and otherwise return an empty alignment?
521 throw format_error{
522 detail::to_string("The cigar string '",
523 offset_tmp,
524 "S",
525 ref_length,
526 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
527 "stored in the optional field CG. You need to read in the field::tags and "
528 "field::seq in order to access this information.")};
529 }
530 else
531 {
532 auto it = tag_dict.find("CG"_tag);
533
534 if (it == tag_dict.end())
536 "The cigar string '",
537 offset_tmp,
538 "S",
539 ref_length,
540 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
541 "stored in the optional field CG but this tag is not present in the given ",
542 "record.")};
543
544 auto cigar_view = std::views::all(std::get<std::string>(it->second));
545 int32_t seq_length{0};
546 std::tie(cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
547 offset_tmp = 0;
548 int32_t soft_clipping_end{};
549 transfer_soft_clipping_to(cigar_vector, offset_tmp, soft_clipping_end);
550 tag_dict.erase(it); // remove redundant information
551 }
552 }
553 }
554}
555
557template <typename stream_type,
558 typename header_type,
559 typename seq_type,
560 typename id_type,
561 typename ref_seq_type,
562 typename ref_id_type,
563 typename cigar_type,
564 typename qual_type,
565 typename mate_type,
566 typename tag_dict_type>
567inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
568 [[maybe_unused]] sam_file_output_options const & options,
569 [[maybe_unused]] header_type && header,
570 [[maybe_unused]] seq_type && seq,
571 [[maybe_unused]] qual_type && qual,
572 [[maybe_unused]] id_type && id,
573 [[maybe_unused]] int32_t const offset,
574 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
575 [[maybe_unused]] ref_id_type && ref_id,
576 [[maybe_unused]] std::optional<int32_t> ref_offset,
577 [[maybe_unused]] cigar_type && cigar_vector,
578 [[maybe_unused]] sam_flag const flag,
579 [[maybe_unused]] uint8_t const mapq,
580 [[maybe_unused]] mate_type && mate,
581 [[maybe_unused]] tag_dict_type && tag_dict,
582 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
583 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
584{
585 // ---------------------------------------------------------------------
586 // Type Requirements (as static asserts for user friendliness)
587 // ---------------------------------------------------------------------
588 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
589 "The seq object must be a std::ranges::forward_range over "
590 "letters that model seqan3::alphabet.");
591
592 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
593 "The id object must be a std::ranges::forward_range over "
594 "letters that model seqan3::alphabet.");
595
596 static_assert((std::ranges::forward_range<ref_seq_type> && alphabet<std::ranges::range_reference_t<ref_seq_type>>),
597 "The ref_seq object must be a std::ranges::forward_range "
598 "over letters that model seqan3::alphabet.");
599
600 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
601 {
602 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
603 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
604 "The ref_id object must be a std::ranges::forward_range "
605 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
606 }
607
608 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
609 "The qual object must be a std::ranges::forward_range "
610 "over letters that model seqan3::alphabet.");
611
613 "The mate object must be a std::tuple of size 3 with "
614 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
615 "2) a std::integral or std::optional<std::integral>, and "
616 "3) a std::integral.");
617
618 static_assert(
619 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
620 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
621 || detail::is_type_specialisation_of_v<
622 std::remove_cvref_t<decltype(std::get<0>(mate))>,
623 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
624 || detail::is_type_specialisation_of_v<
625 std::remove_cvref_t<decltype(std::get<1>(mate))>,
626 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
627 "The mate object must be a std::tuple of size 3 with "
628 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
629 "2) a std::integral or std::optional<std::integral>, and "
630 "3) a std::integral.");
631
632 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
633 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
634
635 if constexpr (detail::decays_to_ignore_v<header_type>)
636 {
637 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
638 "You can either construct the output file with ref_ids and ref_seqs information and "
639 "the header will be created for you, or you can access the `header` member directly."};
640 }
641 else
642 {
643 // ---------------------------------------------------------------------
644 // logical Requirements
645 // ---------------------------------------------------------------------
646
647 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
648 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
649
650 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
651
652 // ---------------------------------------------------------------------
653 // Writing the BAM Header on first call
654 // ---------------------------------------------------------------------
656 {
657 stream << "BAM\1";
659 write_header(os, options, header); // write SAM header to temporary stream to query the size.
660 int32_t l_text{static_cast<int32_t>(os.str().size())};
661 std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
662
663 stream << os.str();
664
665 int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
666 std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
667
668 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
669 {
670 int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
671 std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
672 // write reference name:
673 std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
674 stream_it = '\0';
675 // write reference sequence length:
676 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
677 }
678
679 header_was_written = true;
680 }
681
682 // ---------------------------------------------------------------------
683 // Writing the Record
684 // ---------------------------------------------------------------------
685 int32_t ref_length{};
686
687 // Compute the ref_length from given cigar_vector which is needed to fill field `bin`.
688 if (!std::ranges::empty(cigar_vector))
689 {
690 int32_t dummy_seq_length{};
691 for (auto & [count, operation] : cigar_vector)
692 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
693 }
694
695 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
696 {
697 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
698 cigar_vector.resize(2);
699 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
700 cigar_vector[1] = cigar{static_cast<uint32_t>(ref_length), 'N'_cigar_operation};
701 }
702
703 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
704
705 // Compute the value for the l_read_name field for the bam record.
706 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
707 // the data type to store the value is uint8_t and 255 is the maximal size.
708 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
709 // 2 bytes.
710 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
711 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
712
713 alignment_record_core core{/* block_size */ 0, // will be initialised right after
714 /* refID */ -1, // will be initialised right after
715 /* pos */ ref_offset.value_or(-1),
716 /* l_read_name */ read_name_size,
717 /* mapq */ mapq,
718 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
719 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
720 /* flag */ flag,
721 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
722 /* next_refId */ -1, // will be initialised right after
723 /* next_pos */ get<1>(mate).value_or(-1),
724 /* tlen */ get<2>(mate)};
725
726 auto check_and_assign_id_to = [&header]([[maybe_unused]] auto & id_source, [[maybe_unused]] auto & id_target)
727 {
728 using id_t = std::remove_reference_t<decltype(id_source)>;
729
730 if constexpr (!detail::decays_to_ignore_v<id_t>)
731 {
732 if constexpr (std::integral<id_t>)
733 {
734 id_target = id_source;
735 }
736 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
737 {
738 id_target = id_source.value_or(-1);
739 }
740 else
741 {
742 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
743 {
744 auto id_it = header.ref_dict.end();
745
746 if constexpr (std::ranges::contiguous_range<decltype(id_source)>
747 && std::ranges::sized_range<decltype(id_source)>
748 && std::ranges::borrowed_range<decltype(id_source)>)
749 {
750 id_it = header.ref_dict.find(
751 std::span{std::ranges::data(id_source), std::ranges::size(id_source)});
752 }
753 else
754 {
755 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
756
757 static_assert(
758 implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
759 "The ref_id type is not convertible to the reference id information stored in the "
760 "reference dictionary of the header object.");
761
762 id_it = header.ref_dict.find(id_source);
763 }
764
765 if (id_it == header.ref_dict.end())
766 {
767 throw format_error{detail::to_string("Unknown reference name '",
768 id_source,
769 "' could "
770 "not be found in BAM header ref_dict: ",
771 header.ref_dict,
772 ".")};
773 }
774
775 id_target = id_it->second;
776 }
777 }
778 }
779 };
780
781 // initialise core.refID
782 check_and_assign_id_to(ref_id, core.refID);
783
784 // initialise core.next_refID
785 check_and_assign_id_to(get<0>(mate), core.next_refID);
786
787 // initialise core.block_size
788 core.block_size = sizeof(core) - 4 /*block_size excluded*/ + core.l_read_name + core.n_cigar_op * 4
789 + // each int32_t has 4 bytes
790 (core.l_seq + 1) / 2 + // bitcompressed seq
791 core.l_seq + // quality string
792 tag_dict_binary_str.size();
793
794 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
795
796 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
797 stream_it = '*';
798 else
799 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
800 stream_it = '\0';
801
802 // write cigar
803 for (auto [cigar_count, op] : cigar_vector)
804 {
805 cigar_count = cigar_count << 4;
806 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
807 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
808 }
809
810 // write seq (bit-compressed: dna16sam characters go into one byte)
811 using alph_t = std::ranges::range_value_t<seq_type>;
812 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
813
814 auto sit = std::ranges::begin(seq);
815 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
816 {
817 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
818 ++sidx, ++sit;
819 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
820 stream_it = static_cast<char>(compressed_chr);
821 }
822
823 if (core.l_seq & 1) // write one more
824 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
825
826 // write qual
827 if (std::ranges::empty(qual))
828 {
829 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
830 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
831 }
832 else
833 {
834 if (std::ranges::distance(qual) != core.l_seq)
835 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
836 core.l_seq,
837 ". Got quality with size ",
838 std::ranges::distance(qual),
839 " instead.")};
840
841 auto v = qual
843 [](auto chr)
844 {
845 return static_cast<char>(to_rank(chr));
846 });
847 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
848 }
849
850 // write optional fields
851 stream << tag_dict_binary_str;
852 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
853}
854
856template <typename stream_view_type, typename value_type>
858 stream_view_type && stream_view,
859 value_type const & SEQAN3_DOXYGEN_ONLY(value))
860{
861 int32_t count;
862 read_integral_byte_field(stream_view, count); // read length of vector
863 std::vector<value_type> tmp_vector;
864 tmp_vector.reserve(count);
865
866 while (count > 0)
867 {
868 value_type tmp{};
869 if constexpr (std::integral<value_type>)
870 {
871 read_integral_byte_field(stream_view, tmp);
872 }
873 else if constexpr (std::same_as<value_type, float>)
874 {
875 read_float_byte_field(stream_view, tmp);
876 }
877 else
878 {
879 constexpr bool always_false = std::is_same_v<value_type, void>;
880 static_assert(always_false, "format_bam::read_sam_dict_vector: unsupported value_type");
881 }
882 tmp_vector.push_back(std::move(tmp));
883 --count;
884 }
885 variant = std::move(tmp_vector);
886}
887
905template <typename stream_view_type>
906inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
907{
908 /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
909 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
910 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
911 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
912 by the length (int32_t) of the array, followed by the values.
913 */
914 auto it = std::ranges::begin(stream_view);
915 uint16_t tag = static_cast<uint16_t>(*it) << 8;
916 ++it; // skip char read before
917
918 tag += static_cast<uint16_t>(*it);
919 ++it; // skip char read before
920
921 char type_id = *it;
922 ++it; // skip char read before
923
924 switch (type_id)
925 {
926 case 'A': // char
927 {
928 target[tag] = *it;
929 ++it; // skip char that has been read
930 break;
931 }
932 // all integer sizes are possible
933 case 'c': // int8_t
934 {
935 int8_t tmp;
936 read_integral_byte_field(stream_view, tmp);
937 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
938 break;
939 }
940 case 'C': // uint8_t
941 {
942 uint8_t tmp;
943 read_integral_byte_field(stream_view, tmp);
944 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
945 break;
946 }
947 case 's': // int16_t
948 {
949 int16_t tmp;
950 read_integral_byte_field(stream_view, tmp);
951 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
952 break;
953 }
954 case 'S': // uint16_t
955 {
956 uint16_t tmp;
957 read_integral_byte_field(stream_view, tmp);
958 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
959 break;
960 }
961 case 'i': // int32_t
962 {
963 int32_t tmp;
964 read_integral_byte_field(stream_view, tmp);
965 target[tag] = std::move(tmp); // readable sam format only allows int32_t
966 break;
967 }
968 case 'I': // uint32_t
969 {
970 uint32_t tmp;
971 read_integral_byte_field(stream_view, tmp);
972 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
973 break;
974 }
975 case 'f': // float
976 {
977 float tmp;
978 read_float_byte_field(stream_view, tmp);
979 target[tag] = tmp;
980 break;
981 }
982 case 'Z': // string
983 {
985 while (!is_char<'\0'>(*it))
986 {
988 ++it;
989 }
990 ++it; // skip \0
991 target[tag] = string_buffer;
992 break;
993 }
994 case 'H': // byte array, represented as null-terminated string; specification requires even number of bytes
995 {
996 std::vector<std::byte> byte_array;
997 std::byte value;
998 while (!is_char<'\0'>(*it))
999 {
1002 ++it;
1003
1004 if (*it == '\0')
1005 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1006
1008 ++it;
1010 byte_array.push_back(value);
1011 }
1012 ++it; // skip \0
1013 target[tag] = byte_array;
1014 break;
1015 }
1016 case 'B': // Array. Value type depends on second char [cCsSiIf]
1017 {
1018 char array_value_type_id = *it;
1019 ++it; // skip char read before
1020
1021 switch (array_value_type_id)
1022 {
1023 case 'c': // int8_t
1024 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1025 break;
1026 case 'C': // uint8_t
1027 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1028 break;
1029 case 's': // int16_t
1030 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1031 break;
1032 case 'S': // uint16_t
1033 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1034 break;
1035 case 'i': // int32_t
1036 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1037 break;
1038 case 'I': // uint32_t
1039 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1040 break;
1041 case 'f': // float
1042 read_sam_dict_vector(target[tag], stream_view, float{});
1043 break;
1044 default:
1045 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1046 "must be one of [cCsSiIf] but '",
1047 array_value_type_id,
1048 "' was given.")};
1049 }
1050 break;
1051 }
1052 default:
1053 throw format_error{detail::to_string("The second character in the numerical id of a "
1054 "SAM tag must be one of [A,i,Z,H,B,f] but '",
1055 type_id,
1056 "' was given.")};
1057 }
1058}
1059
1074template <typename cigar_input_type>
1075inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1076{
1077 std::vector<cigar> operations{};
1078 char operation{'\0'};
1079 uint32_t count{};
1080 int32_t ref_length{}, seq_length{};
1081 uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1082 constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1083 constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1084
1085 if (n_cigar_op == 0) // [[unlikely]]
1086 return std::tuple{operations, ref_length, seq_length};
1087
1088 // parse the rest of the cigar
1089 // -------------------------------------------------------------------------------------------------------------
1090 while (n_cigar_op > 0) // until stream is not empty
1091 {
1093 sizeof(operation_and_count),
1094 reinterpret_cast<char *>(&operation_and_count));
1095 operation = cigar_mapping[operation_and_count & cigar_mask];
1096 count = operation_and_count >> 4;
1097
1098 detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1099 operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1100 --n_cigar_op;
1101 }
1102
1103 return std::tuple{operations, ref_length, seq_length};
1104}
1105
1110{
1111 std::string result{};
1112
1113 auto stream_variant_fn = [&result](auto && arg) // helper to print a std::variant
1114 {
1115 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1116 using T = std::remove_cvref_t<decltype(arg)>;
1117
1118 if constexpr (std::same_as<T, int32_t>)
1119 {
1120 // always choose the smallest possible representation [cCsSiI]
1121 size_t const absolute_arg = std::abs(arg);
1122 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1123 bool const negative = arg < 0;
1124 n = n * n + 2 * negative; // for switch case order
1125
1126 switch (n)
1127 {
1128 case 0:
1129 {
1130 result[result.size() - 1] = 'C';
1131 result.append(reinterpret_cast<char const *>(&arg), 1);
1132 break;
1133 }
1134 case 1:
1135 {
1136 result[result.size() - 1] = 'S';
1137 result.append(reinterpret_cast<char const *>(&arg), 2);
1138 break;
1139 }
1140 case 2:
1141 {
1142 result[result.size() - 1] = 'c';
1143 int8_t tmp = static_cast<int8_t>(arg);
1144 result.append(reinterpret_cast<char const *>(&tmp), 1);
1145 break;
1146 }
1147 case 3:
1148 {
1149 result[result.size() - 1] = 's';
1150 int16_t tmp = static_cast<int16_t>(arg);
1151 result.append(reinterpret_cast<char const *>(&tmp), 2);
1152 break;
1153 }
1154 default:
1155 {
1156 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1157 break;
1158 }
1159 }
1160 }
1161 else if constexpr (std::same_as<T, std::string>)
1162 {
1163 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1 /*+ null character*/);
1164 }
1165 else if constexpr (!std::ranges::range<T>) // char, float
1166 {
1167 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1168 }
1169 else // std::vector of some arithmetic_type type
1170 {
1171 int32_t sz{static_cast<int32_t>(arg.size())};
1172 result.append(reinterpret_cast<char *>(&sz), 4);
1173 result.append(reinterpret_cast<char const *>(arg.data()),
1174 arg.size() * sizeof(std::ranges::range_value_t<T>));
1175 }
1176 };
1177
1178 for (auto & [tag, variant] : tag_dict)
1179 {
1180 result.push_back(static_cast<char>(tag / 256));
1181 result.push_back(static_cast<char>(tag % 256));
1182
1183 result.push_back(detail::sam_tag_type_char[variant.index()]);
1184
1185 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1186 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1187
1188 std::visit(stream_variant_fn, variant);
1189 }
1190
1191 return result;
1192}
1193
1194} // namespace seqan3
T begin(T... args)
T bit_ceil(T... args)
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:163
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:187
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: alphabet/cigar/cigar.hpp:60
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:40
The alignment base format.
Definition: format_sam_base.hpp:45
void write_header(stream_t &stream, sam_file_output_options const &options, sam_file_header< ref_ids_type > &header)
Writes the SAM header.
Definition: format_sam_base.hpp:636
void transfer_soft_clipping_to(std::vector< cigar > const &cigar_vector, int32_t &sc_begin, int32_t &sc_end) const
Transfer soft clipping information from the cigar_vector to sc_begin and sc_end.
Definition: format_sam_base.hpp:151
void read_forward_range_field(stream_view_type &&stream_view, target_range_type &target)
Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to.
Definition: format_sam_base.hpp:232
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition: format_sam_base.hpp:66
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &)
Reads the SAM header.
Definition: format_sam_base.hpp:300
void read_byte_field(stream_view_t &&stream_view, std::byte &byte_target)
Reads std::byte fields using std::from_chars.
Definition: format_sam_base.hpp:204
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The actual implementation of seqan3::cigar::operation for documentation purposes only....
Definition: cigar_operation.hpp:48
The BAM format.
Definition: format_bam.hpp:50
void read_sam_dict_field(stream_view_type &&stream_view, sam_tag_dictionary &target)
Reads the optional tag fields into the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:906
format_bam()=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:259
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
void read_integral_byte_field(stream_view_type &&stream_view, number_type &target)
Reads a arithmetic field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:208
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
auto parse_binary_cigar(cigar_input_type &&cigar_input, uint16_t n_cigar_op) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_bam.hpp:1075
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:567
void read_float_byte_field(stream_view_type &&stream_view, float &target)
Reads a float field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:219
static std::string get_tag_dict_str(sam_tag_dictionary const &tag_dict)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:1109
std::string string_buffer
Local buffer to read into while avoiding reallocation.
Definition: format_bam.hpp:141
static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
Definition: format_bam.hpp:185
void read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view, value_type const &value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition: format_bam.hpp:857
bool header_was_read
A variable that tracks whether the content of header has been read or not.
Definition: format_bam.hpp:138
static constexpr std::array< uint8_t, 256 > char_to_sam_rank
Converts a cigar op character to the rank according to the official BAM specifications.
Definition: format_bam.hpp:163
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:66
Stores the header information of alignment files.
Definition: header.hpp:34
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:143
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:182
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:179
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
T clear(T... args)
T copy(T... args)
T copy_n(T... args)
T countr_zero(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T equal(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: core/range/detail/misc.hpp:28
constexpr auto all
Returns a view that includes all elements of the range argument.
Definition: all_view.hpp:204
std::tuple< std::vector< cigar >, int32_t, int32_t > parse_cigar(cigar_input_type &&cigar_input)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: io/sam_file/detail/cigar.hpp:148
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition: io/sam_file/detail/cigar.hpp:182
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:45
void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition: io/sam_file/detail/cigar.hpp:105
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:42
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly_view.hpp:590
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf_view.hpp:107
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:470
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: type_pack/traits.hpp:164
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
std::string to_string(value_type &&... values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
T push_back(T... args)
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Stores all fixed length variables which can be read/written directly by reinterpreting the binary str...
Definition: format_bam.hpp:145
uint32_t bin
The bin number.
Definition: format_bam.hpp:151
sam_flag flag
The flag value (uint16_t enum).
Definition: format_bam.hpp:153
uint32_t n_cigar_op
The number of cigar operations of the alignment.
Definition: format_bam.hpp:152
int32_t next_refID
The reference id of the mate.
Definition: format_bam.hpp:155
int32_t l_seq
The number of bases of the read sequence.
Definition: format_bam.hpp:154
int32_t refID
The reference id the read was mapped to.
Definition: format_bam.hpp:147
int32_t pos
The begin position of the alignment.
Definition: format_bam.hpp:148
int32_t block_size
The size in bytes of the whole BAM record.
Definition: format_bam.hpp:146
uint32_t l_read_name
The length of the read name including the \0 character.
Definition: format_bam.hpp:149
int32_t tlen
The template length of the read and its mate.
Definition: format_bam.hpp:157
uint32_t mapq
The mapping quality.
Definition: format_bam.hpp:150
int32_t next_pos
The begin position of the mate alignment.
Definition: format_bam.hpp:156
Thrown if information given to output format didn't match expectations.
Definition: io/exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sam_file/input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: sam_file/output_options.hpp:26
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T visit(T... args)