|
template<typename stream_type , typename seq_legal_alph_type , typename ref_seqs_type , typename ref_ids_type , typename stream_pos_type , typename seq_type , typename id_type , typename offset_type , typename ref_seq_type , typename ref_id_type , typename ref_offset_type , typename align_type , typename cigar_type , typename flag_type , typename mapq_type , typename qual_type , typename mate_type , typename tag_dict_type , typename e_value_type , typename bit_score_type > |
void | read_alignment_record (stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score) |
| Read from the specified stream and back-insert into the given field buffers. More...
|
|
template<typename stream_type , typename header_type , typename seq_type , typename id_type , typename ref_seq_type , typename ref_id_type , typename align_type , typename cigar_type , typename qual_type , typename mate_type , typename tag_dict_type > |
void | write_alignment_record (stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score) |
| Write the given fields to the specified stream. More...
|
|
|
template<typename cigar_input_type > |
auto | parse_binary_cigar (cigar_input_type &&cigar_input, uint16_t n_cigar_op) const |
| Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)). More...
|
|
template<typename stream_view_type > |
void | read_float_byte_field (stream_view_type &&stream_view, float &target) |
| Reads a float field from a binary stream by directly reinterpreting the bits. More...
|
|
template<typename stream_view_type , std::integral number_type> |
void | read_integral_byte_field (stream_view_type &&stream_view, number_type &target) |
| Reads an arithmetic field from a binary stream by directly reinterpreting the bits. More...
|
|
template<typename stream_view_type > |
void | read_sam_dict_field (stream_view_type &&stream_view, sam_tag_dictionary &target) |
| Reads the optional tag fields into the seqan3::sam_tag_dictionary. More...
|
|
template<typename stream_view_type , typename value_type > |
void | read_sam_dict_vector (seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view, value_type const &value) |
| Reads a list of values separated by commas, as is the case for SAM tag arrays. More...
|
|
template<typename ref_id_type , typename ref_id_tmp_type , typename header_type , typename ref_seqs_type > |
void | check_and_assign_ref_id (ref_id_type &ref_id, ref_id_tmp_type &ref_id_tmp, header_type &header, ref_seqs_type &) |
| Checks for known reference ids or adds a new reference id and assigns a reference id to ref_id. More...
|
|
template<typename align_type , typename ref_seqs_type > |
void | construct_alignment (align_type &align, std::vector< cigar > &cigar_vector, int32_t rid, ref_seqs_type &ref_seqs, int32_t ref_start, size_t ref_length) |
| Construct the field::alignment depending on the given information. More...
|
|
template<typename stream_view_t , arithmetic arithmetic_target_type> |
void | read_arithmetic_field (stream_view_t &&stream_view, arithmetic_target_type &arithmetic_target) |
| Reads arithmetic fields using std::from_chars. More...
|
|
template<typename stream_view_t > |
void | read_byte_field (stream_view_t &&stream_view, std::byte &byte_target) |
| Reads std::byte fields using std::from_chars. More...
|
|
template<typename stream_view_type , std::ranges::forward_range target_range_type> |
void | read_forward_range_field (stream_view_type &&stream_view, target_range_type &target) |
| Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to. More...
|
|
template<typename stream_view_type , typename ref_ids_type , typename ref_seqs_type > |
void | read_header (stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &) |
| Reads the SAM header. More...
|
|
void | transfer_soft_clipping_to (std::vector< cigar > const &cigar_vector, int32_t &sc_begin, int32_t &sc_end) const |
| Transfer soft clipping information from the cigar_vector to sc_begin and sc_end. More...
|
|
template<typename stream_t , typename ref_ids_type > |
void | write_header (stream_t &stream, sam_file_output_options const &options, sam_file_header< ref_ids_type > &header) |
| Writes the SAM header. More...
|
|
| format_sam_base ()=default |
| Defaulted.
|
|
| format_sam_base (format_sam_base const &)=default |
| Defaulted.
|
|
format_sam_base & | operator= (format_sam_base const &)=default |
| Defaulted.
|
|
| format_sam_base (format_sam_base &&)=default |
| Defaulted.
|
|
format_sam_base & | operator= (format_sam_base &&)=default |
| Defaulted.
|
|
| ~format_sam_base ()=default |
| Defaulted.
|
|
The BAM format.
The BAM format is the binary version of the SAM format:
Introduction
SAM is often used for storing alignments of several read sequences against one or more reference sequences. See the article on Wikipedia for an introduction to the format or look into the official SAM format specifications. SeqAn implements version 1.6 of the SAM specification.
Take a look at our tutorial SAM Input and Output in SeqAn for a walkthrough of how to read alignment files.
Fields
The SAM format provides the following fields: seqan3::field::alignment, seqan3::field::seq, seqan3::field::qual, seqan3::field::id, seqan3::field::ref_seq, seqan3::field::ref_id, seqan3::field::ref_offset, seqan3::field::offset, seqan3::field::flag, seqan3::field::mapq and seqan3::field::mate. In addition there is the seqan3::field::header_ptr, which is usually only used internally to provide the range-based functionality of the file.
None of the fields are required when writing. If they are not given, a default value of '0' for numeric fields and '*' for other fields is used.
SAM format columns -> fields
Since many users will be accustomed to the columns of the SAM format, here is a mapping of the common SAM format columns to the SeqAn record fields:
The (read sequence/query) OFFSET will be required to store the soft clipping information at the read start (end clipping will be automatically deduced by how much the read sequence length + offset is larger than the alignment length).
Note: SeqAn currently does not support hard clipping. When reading SAM, hard-clipping is discarded; but the resulting alignment/sequence combination is still valid.
Format Check
The format checks are implemented according to the official SAM format specifications in order to ensure correct SAM file output.
If a non-recoverable format violation is encountered on reading, or you specify invalid values/combinations when writing, seqan3::format_error is thrown.
Header implementation
The SAM header (if present) is read/written once in the beginning before the first record is read/written.
template<typename cigar_input_type >
auto seqan3::format_bam::parse_binary_cigar |
( |
cigar_input_type && |
cigar_input, |
|
|
uint16_t |
n_cigar_op |
|
) |
| const |
|
inlineprivate |
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
- Template Parameters
-
cigar_input_type | The type of a single pass input view over the cigar string; must model std::ranges::input_range. |
- Parameters
-
[in] | cigar_input | The single pass input view over the cigar string to parse. |
[in] | n_cigar_op | The number of cigar elements to read from the cigar_input. |
- Returns
- A tuple of size three containing (1) std::vector over seqan3::cigar, that describes the alignment, (2) the aligned reference length, (3) the aligned query sequence length.
For example, the view over the cigar string "1H4M1D2M2S" will return {[(H,1), (M,4), (D,1), (M,2), (S,2)], 7, 6}.
template<typename stream_type , typename seq_legal_alph_type , typename ref_seqs_type , typename ref_ids_type , typename stream_pos_type , typename seq_type , typename id_type , typename offset_type , typename ref_seq_type , typename ref_id_type , typename ref_offset_type , typename align_type , typename cigar_type , typename flag_type , typename mapq_type , typename qual_type , typename mate_type , typename tag_dict_type , typename e_value_type , typename bit_score_type >
void seqan3::format_bam::read_alignment_record |
( |
stream_type & |
stream, |
|
|
sam_file_input_options< seq_legal_alph_type > const & |
options, |
|
|
ref_seqs_type & |
ref_seqs, |
|
|
sam_file_header< ref_ids_type > & |
header, |
|
|
stream_pos_type & |
position_buffer, |
|
|
seq_type & |
seq, |
|
|
qual_type & |
qual, |
|
|
id_type & |
id, |
|
|
offset_type & |
offset, |
|
|
ref_seq_type & |
ref_seq, |
|
|
ref_id_type & |
ref_id, |
|
|
ref_offset_type & |
ref_offset, |
|
|
align_type & |
align, |
|
|
cigar_type & |
cigar_vector, |
|
|
flag_type & |
flag, |
|
|
mapq_type & |
mapq, |
|
|
mate_type & |
mate, |
|
|
tag_dict_type & |
tag_dict, |
|
|
e_value_type & |
e_value, |
|
|
bit_score_type & |
bit_score |
|
) |
| |
|
inlineprotected |
Read from the specified stream and back-insert into the given field buffers.
- Template Parameters
-
- Parameters
-
Additional requirements
- The function must also accept std::ignore as a parameter for any of the fields, except stream, options and header. [This is enforced by the concept checker!]
- In this case the data read for that field shall be discarded by the format.