Skip to content

AlignmentTagger

An object to facilitate adding tags to alignments in a bam file

AlignmentTagger

Bases: BarcodeParser

Given an indexed fasta file (genome), id length and insertion length, this object can return a read tagged with the RG, XS and XZ tags

Source code in callingcardstools/Alignment/AlignmentTagger.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
class AlignmentTagger(BarcodeParser):
    """Add barcode and insertion-site tags to alignments from a bam file.

    Given the barcode details JSON and an indexed fasta file (genome), this
    object can return a read tagged with the barcode-derived tags and the
    XS, XI, XE and XZ insertion-site tags.
    """

    # Placeholders; __init__ replaces them through the property setters below.
    _fasta = ""
    _genome = ""

    def __init__(self, barcode_details_json: str, fasta_path: str) -> None:
        """Initialize the tagger with barcode details and a genome fasta.

        Args:
            barcode_details_json (str): The path to a JSON file containing
                barcode details. Passed unchanged to the parent initializer.
            fasta_path (str): The path to the genome fasta file. A .fai index
                file created by samtools faidx must exist at the same location.

        Raises:
            FileNotFoundError: Raised if the path to the fasta file or its
                index file doesn't exist.
        """
        super().__init__(barcode_details_json)
        # the setter validates that both the fasta and its .fai exist
        self.fasta = fasta_path
        # open the fasta file as a pysam.FastaFile obj
        self.open()

    def __del__(self):
        """Ensure that the genome file is closed when the object is deleted."""
        del self.genome

    @property
    def fasta(self) -> str:
        """Path to the fasta file. The index .fai file MUST BE in the
        same directory."""
        return self._fasta

    @fasta.setter
    def fasta(self, new_fasta: str) -> None:
        if not os.path.exists(new_fasta):
            raise FileNotFoundError(
                f'{new_fasta} does not exist -- check path')
        if not os.path.exists(new_fasta+'.fai'):
            raise FileNotFoundError(f"Genome index not found for {new_fasta}. "
                                    f"The index .fai file must exist in same "
                                    f"path. Use samtools faidx to create "
                                    f"an index if one DNE")
        self._fasta = new_fasta

    @property
    def genome(self):
        """pysam FastaFile object."""
        return self._genome

    @genome.setter
    def genome(self, new_genome_obj: pysam.FastaFile):  # pylint:disable=E1101
        self._genome = new_genome_obj

    @genome.deleter
    def genome(self):
        # Deleting the property closes the handle rather than removing the
        # attribute. If open() never ran, _genome is still the empty-string
        # placeholder, which has no close() -- that case is ignored.
        try:
            self._genome.close()
        except AttributeError:
            pass

    def open(self):
        """Open the genome file and set the self.genome attribute."""
        self.genome = pysam.FastaFile(self.fasta, self.fasta+'.fai')  # pylint:disable=E1101 # noqa

    def is_open(self):
        """Check if the genome file is open."""
        return self.genome.is_open()

    def close(self):
        """Close the genome file."""
        del self.genome

    def extract_tag_dict(self, id: str) -> dict:
        """Parse an id string created by ReadParser into a dictionary of tags.

        The tag string is the second `_`-delimited field of the id. Tags are
        delimited by `;` and each tag is split from its value on the FIRST
        `-` only, so a value that itself contains `-` is kept whole.

        Args:
            id (str): id line from a given read in a bam produced from a fastq
                processed by (a script that uses) the ReadParser

        Raises:
            IndexError: Raised if the id has no `_`-delimited tag string, or
                if any tag lacks a `-` separating it from its value

        Returns:
            dict: For example, the id line
                MN00200:647:000H533KW:1:11102:20080:1075_RT-AATTCACTACGTCAACA;RS-TaqAI;TF-ERT1
                would be returned as
                {'RT': 'AATTCACTACGTCAACA', 'RS': 'TaqAI', 'TF': 'ERT1'}
        """
        try:
            tag_str = id.split('_')[1]
        except IndexError as exc:
            raise IndexError('No read ID present -- '
                             'expecting a string appended to the read '
                             'ID with a _ in the bam') from exc
        tag_dict = {}
        try:
            for tag_value in tag_str.split(';'):
                # maxsplit=1: only the first '-' separates tag from value
                fields = tag_value.split('-', 1)
                tag_dict[fields[0]] = fields[1]
        except IndexError as exc:
            raise IndexError(f'{tag_str} not formed as expected -- '
                             f'should have format similar to '
                             f'RT-AATTCACTACGTCAACA;RS-TaqAI;TF-ERT1 where '
                             f'different tags are delimited by ; and '
                             f'tag-value pairs are delimited by - ') \
                from exc
        return tag_dict

    def tag_read(self, read, decompose_barcode: bool = True) -> dict:
        """Add barcode tags and the XS, XI, XE and XZ tags to a read.

        The insertion-site tags are computed from the read's alignment:
            XS: the 5 prime position of the read, adjusted for soft clipping
            XI, XE: start and end of the insert region handed to
                genome.fetch()
            XZ: the upper-cased genome sequence of that region
        Any of these which cannot be computed (unmapped read, or a position
        which falls off either end of the chromosome) is set to "*".

        Args:
            read (AlignedSegment): An aligned segment object -- eg returned
                in a for loop by iterating over bam.fetch() object from pysam
            decompose_barcode (bool): if the barcode is appended as a
                read identifier on the bam id line, rather than an already
                decomposed tag string, then extract the barcode and evaluate
                it against expectations in the barcode_details json.
                Default to True.

        Raises:
            IndexError: Raised if no read ID is present.
            TypeError: Raised when the cigar string is not parse-able in a
                given read
            ValueError: Raised when the insertion sequence indices
                are out of bounds

        Returns:
            dict: A dictionary with key:value pairs
                {'read': tagged_read, 'barcode_details': dictionary of barcode
                details}. 'barcode_details' is empty when decompose_barcode
                is False.
        """
        logger.debug(read.query_name)
        tag_dict = {}
        barcode_dict = {}

        # decompose_barcode determines whether the function expects to see
        # an undecomposed barcode string in the id location of the read.id
        # or an already parsed barcode string
        if decompose_barcode:
            try:
                tag_str = read.query_name.split('_')[1]
            except IndexError as exc:
                raise IndexError('No read ID present -- '
                                 'expecting a string appended to the read '
                                 'ID with a _ in the bam') from exc
            logger.debug(tag_str)
            barcode_dict = self.decompose_barcode(tag_str)
            # only components which declare a bam_tag are written to the read
            for component in barcode_dict['details'].values():
                bam_tag = component.get('bam_tag', None)
                if bam_tag:
                    tag_dict[bam_tag] = \
                        component.get('query')+'/'+str(component.get('dist'))
        else:
            # add tags from the id line
            tag_dict.update(self.extract_tag_dict(read.query_name))

        # value of every insertion-site tag when nothing can be computed
        no_site = dict.fromkeys(('XS', 'XI', 'XE', 'XZ'), "*")

        # (using the bitwise operator) check if the read is unmapped. If so
        # there is no alignment, and so no insertion site to report
        if read.flag & 0x4:
            tag_dict.update(no_site)
        # if the bit flag 0x10 is set, the read is on the reverse strand
        elif read.flag & 0x10:
            # Each cigartuple is (operation, length), eg cigar 30M is (0, 30).
            # Operation 4 is cigar S (BAM_CSOFT_CLIP). For a reverse strand
            # read the clipping adjacent to the read's 5 prime end is the
            # LAST cigar operation. If it is not a soft clip, the soft clip
            # length is 0.
            try:
                last_op = read.cigartuples[-1]
                soft_clip_length = last_op[1] if last_op[0] == 4 else 0
            except TypeError as exc:
                raise TypeError(f"Read {read.query_name}, "
                                f"cigar string {read.cigartuples} "
                                f"is not parse-able") \
                    from exc

            # The insertion point is at the end of the alignment
            # note that this is -1 because per the docs
            # reference_end points to one past the last aligned residue.
            read_5_prime = (read.reference_end-1)+soft_clip_length
            # the `insert_length` bases which follow the read's 5 prime end
            # in reference coordinates, as a half-open interval
            insert_start = read_5_prime + 1
            insert_end = insert_start + self.insert_length

            try:
                chrom_length = \
                    self.genome.get_reference_length(read.reference_name)
                # valid indices are 0..chrom_length-1, so a soft-clip
                # adjusted 5 prime end AT chrom_length is already off the
                # end of the chrom
                if read_5_prime >= chrom_length:
                    tag_dict.update(no_site)
                # insert_end is exclusive, so an insert which ends exactly
                # at chrom_length is still fully on the chrom
                elif insert_end > chrom_length:
                    tag_dict.update({**no_site, 'XS': read_5_prime})
                else:
                    tag_dict.update({
                        'XS': read_5_prime,
                        'XI': insert_start,
                        'XE': insert_end,
                        'XZ': self.genome.fetch(read.reference_name,
                                                insert_start,
                                                insert_end).upper()})
            except ValueError as exc:
                raise ValueError(f"Read {read.query_name}, "
                                 f"insert region {read.reference_name}:{insert_start}-" # noqa
                                 f"{insert_end} "
                                 f"is out of bounds") \
                    from exc

        # else, Read is in the forward orientation. Note that a single end
        # forward strand read with no other flags will have flag 0
        else:
            # For a forward read the relevant soft clip is the FIRST cigar
            # operation (see the reverse strand branch for details)
            try:
                first_op = read.cigartuples[0]
                soft_clip_length = first_op[1] if first_op[0] == 4 else 0
            except TypeError as exc:
                raise TypeError(f"Read {read.query_name}, "
                                f"cigar string {read.cigartuples} is not "
                                f"parse-able") \
                    from exc
            # extract insert position
            read_5_prime = read.reference_start - soft_clip_length
            # the `insert_length` bases which precede the read (after
            # adjusting for soft clipping)
            insert_start = read_5_prime - self.insert_length

            try:
                # the soft clip adjustment moved the 5 prime end off the
                # beginning of the chrom
                if read_5_prime < 0:
                    tag_dict.update(no_site)
                # the insertion sequence extends beyond the beginning of
                # the chrom
                elif insert_start < 0:
                    tag_dict.update({**no_site, 'XS': read_5_prime})
                else:
                    tag_dict.update({
                        'XS': read_5_prime,
                        'XI': insert_start,
                        'XE': read_5_prime,
                        'XZ': self.genome.fetch(read.reference_name,
                                                insert_start,
                                                read_5_prime).upper()})
            except ValueError as exc:
                raise ValueError(f"Read {read.query_name}, "
                                 f"insert region "
                                 f"{read.reference_name}:{insert_start}-" # noqa
                                 f"{read_5_prime} is out of bounds") from exc

        # Set tags ------------------------------------------------------------
        for tag, tag_value in tag_dict.items():
            read.set_tag(tag, tag_value)

        return {'read': read, 'barcode_details': barcode_dict}

fasta: str property writable

path to the fasta file. The index .fai file MUST BE in the same directory

genome deletable property writable

pysam FastaFile object

__del__()

ensure that the genome file is closed when deleted

Source code in callingcardstools/Alignment/AlignmentTagger.py
42
43
44
def __del__(self):
    """Close the genome file when this object is garbage collected.

    Deleting the ``genome`` attribute is what releases the handle.
    """
    delattr(self, 'genome')

__init__(barcode_details_json, fasta_path)

Initializes the AlignmentTagger object with given barcode details and a fasta file.

Parameters:

Name Type Description Default
barcode_details_json str

The path to a JSON file containing barcode details.

required
fasta_path str

The path to the genome fasta file. A .fai index file created by samtools faidx must exist at the same location.

required

Raises:

Type Description
FileNotFoundError

Raised if the path to the fasta file or its index file doesn’t exist.

Source code in callingcardstools/Alignment/AlignmentTagger.py
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def __init__(self, barcode_details_json: str, fasta_path: str) -> None:
    """Initialize the AlignmentTagger with barcode details and a genome fasta.

    The three steps below are order dependent: the parent initializer runs
    first, then the fasta path is stored, and only then is the genome opened
    (open() reads self.fasta).

    Args:
        barcode_details_json (str): The path to a JSON file containing
            barcode details. Passed unchanged to the parent initializer.
        fasta_path (str): The path to the genome fasta file. A .fai index
            file created by samtools faidx must exist at the same location.

    Raises:
        FileNotFoundError: Raised if the path to the fasta file or its
            index file doesn't exist.
    """
    super().__init__(barcode_details_json)
    self.fasta = fasta_path
    # open the fasta file as a pysam.FastaFile obj
    self.open()

close()

close the genome file

Source code in callingcardstools/Alignment/AlignmentTagger.py
88
89
90
def close(self):
    """Close the genome file by deleting the ``genome`` attribute."""
    delattr(self, 'genome')

extract_tag_dict(id)

given an id string created by ReadParser, parse into a dictionary of tags

Parameters:

Name Type Description Default
id str

id line from a given read in a bam produced from a fastq processed by (a script that uses) the ReadParser

required

Raises:

Type Description
IndexError

Raised if parsing of the id doesn’t work as expected

Returns:

Name Type Description
dict dict

For example, the id line MN00200:647:000H533KW:1:11102:20080:1075_RT-AATTCACTACGTCAACA;RS-TaqAI;TF-ERT1 would be returned as {‘RT’: ‘AATTCACTACGTCAACA’, ‘RS’: ‘TaqAI’, ‘TF’: ‘ERT1’}

Source code in callingcardstools/Alignment/AlignmentTagger.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def extract_tag_dict(self, id: str) -> dict:
    """Parse an id string created by ReadParser into a dictionary of tags.

    The tag string is the second `_`-delimited field of the id. Tags are
    delimited by `;` and each tag is split from its value on the FIRST `-`
    only, so a value that itself contains `-` is kept whole.

    Args:
        id (str): id line from a given read in a bam produced from a fastq
            processed by (a script that uses) the ReadParser

    Raises:
        IndexError: Raised if the id has no `_`-delimited tag string, or if
            any tag lacks a `-` separating it from its value

    Returns:
        dict: For example, the id line
            MN00200:647:000H533KW:1:11102:20080:1075_RT-AATTCACTACGTCAACA;RS-TaqAI;TF-ERT1
            would be returned as
            {'RT': 'AATTCACTACGTCAACA', 'RS': 'TaqAI', 'TF': 'ERT1'}
    """
    try:
        tag_str = id.split('_')[1]
    except IndexError as exc:
        raise IndexError('No read ID present -- '
                         'expecting a string appended to the read '
                         'ID with a _ in the bam') from exc
    tag_dict = {}
    try:
        for tag_value in tag_str.split(';'):
            # maxsplit=1: only the first '-' separates tag from value
            fields = tag_value.split('-', 1)
            tag_dict[fields[0]] = fields[1]
    except IndexError as exc:
        raise IndexError(f'{tag_str} not formed as expected -- '
                         f'should have format similar to '
                         f'RT-AATTCACTACGTCAACA;RS-TaqAI;TF-ERT1 where '
                         f'different tags are delimited by ; and '
                         f'tag-value pairs are delimited by - ') \
            from exc
    return tag_dict

is_open()

check if genome file is open

Source code in callingcardstools/Alignment/AlignmentTagger.py
84
85
86
def is_open(self):
    """Report whether the genome file handle is currently open.

    Returns whatever the genome object's own ``is_open()`` reports.
    """
    genome_handle = self.genome
    return genome_handle.is_open()

open()

open the genome file and set the self.genome attribute

Source code in callingcardstools/Alignment/AlignmentTagger.py
80
81
82
def open(self):
    """Open the genome fasta and store the handle on ``self.genome``.

    The index is expected next to the fasta, at the fasta path with
    ``.fai`` appended.
    """
    fasta_path = self.fasta
    index_path = fasta_path + '.fai'
    self.genome = pysam.FastaFile(fasta_path, index_path)  # pylint:disable=E1101 # noqa

tag_read(read, decompose_barcode=True)

Given an AlignedSegment object, add RG, XS and XZ tags

Parameters:

Name Type Description Default
read AlignedSegment

An aligned segment object – e.g. returned in a for loop by iterating over a bam.fetch() object from pysam

required
decompose_barcode bool

If the barcode is appended as a read identifier on the bam id line, rather than an already decomposed tag string, then extract the barcode and evaluate it against expectations in the barcode_details json. Defaults to True.

True

Raises:

Type Description
IndexError

Raised if no read ID is present.

TypeError

Raised when the cigar string is not parseable in a given read

ValueError

Raised when the insertion sequence indices are out of bounds

Returns:

Name Type Description
dict dict

A dictionary with key:value pairs {‘read’: tagged_read, ‘barcode_details’: dictionary of barcode details}

Source code in callingcardstools/Alignment/AlignmentTagger.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
def tag_read(self, read, decompose_barcode: bool = True) -> dict:
    """Add barcode tags and the XS, XI, XE and XZ tags to a read.

    The insertion-site tags are computed from the read's alignment:
        XS: the 5 prime position of the read, adjusted for soft clipping
        XI, XE: start and end of the insert region handed to genome.fetch()
        XZ: the upper-cased genome sequence of that region
    Any of these which cannot be computed (unmapped read, or a position
    which falls off either end of the chromosome) is set to "*".

    Args:
        read (AlignedSegment): An aligned segment object -- eg returned
            in a for loop by iterating over bam.fetch() object from pysam
        decompose_barcode (bool): if the barcode is appended as a
            read identifier on the bam id line, rather than an already
            decomposed tag string, then extract the barcode and evaluate
            it against expectations in the barcode_details json.
            Default to True.

    Raises:
        IndexError: Raised if no read ID is present.
        TypeError: Raised when the cigar string is not parse-able in a
            given read
        ValueError: Raised when the insertion sequence indices
            are out of bounds

    Returns:
        dict: A dictionary with key:value pairs
            {'read': tagged_read, 'barcode_details': dictionary of barcode
            details}. 'barcode_details' is empty when decompose_barcode
            is False.
    """
    logger.debug(read.query_name)
    tag_dict = {}
    barcode_dict = {}

    # decompose_barcode determines whether the function expects to see
    # an undecomposed barcode string in the id location of the read.id
    # or an already parsed barcode string
    if decompose_barcode:
        try:
            tag_str = read.query_name.split('_')[1]
        except IndexError as exc:
            raise IndexError('No read ID present -- '
                             'expecting a string appended to the read '
                             'ID with a _ in the bam') from exc
        logger.debug(tag_str)
        barcode_dict = self.decompose_barcode(tag_str)
        # only components which declare a bam_tag are written to the read
        for component in barcode_dict['details'].values():
            bam_tag = component.get('bam_tag', None)
            if bam_tag:
                tag_dict[bam_tag] = \
                    component.get('query')+'/'+str(component.get('dist'))
    else:
        # add tags from the id line
        tag_dict.update(self.extract_tag_dict(read.query_name))

    # value of every insertion-site tag when nothing can be computed
    no_site = dict.fromkeys(('XS', 'XI', 'XE', 'XZ'), "*")

    # (using the bitwise operator) check if the read is unmapped. If so
    # there is no alignment, and so no insertion site to report
    if read.flag & 0x4:
        tag_dict.update(no_site)
    # if the bit flag 0x10 is set, the read is on the reverse strand
    elif read.flag & 0x10:
        # Each cigartuple is (operation, length), eg cigar 30M is (0, 30).
        # Operation 4 is cigar S (BAM_CSOFT_CLIP). For a reverse strand
        # read the clipping adjacent to the read's 5 prime end is the LAST
        # cigar operation. If it is not a soft clip, the soft clip length
        # is 0.
        try:
            last_op = read.cigartuples[-1]
            soft_clip_length = last_op[1] if last_op[0] == 4 else 0
        except TypeError as exc:
            raise TypeError(f"Read {read.query_name}, "
                            f"cigar string {read.cigartuples} "
                            f"is not parse-able") \
                from exc

        # The insertion point is at the end of the alignment
        # note that this is -1 because per the docs
        # reference_end points to one past the last aligned residue.
        read_5_prime = (read.reference_end-1)+soft_clip_length
        # the `insert_length` bases which follow the read's 5 prime end in
        # reference coordinates, as a half-open interval
        insert_start = read_5_prime + 1
        insert_end = insert_start + self.insert_length

        try:
            chrom_length = \
                self.genome.get_reference_length(read.reference_name)
            # valid indices are 0..chrom_length-1, so a soft-clip adjusted
            # 5 prime end AT chrom_length is already off the end of the
            # chrom
            if read_5_prime >= chrom_length:
                tag_dict.update(no_site)
            # insert_end is exclusive, so an insert which ends exactly at
            # chrom_length is still fully on the chrom
            elif insert_end > chrom_length:
                tag_dict.update({**no_site, 'XS': read_5_prime})
            else:
                tag_dict.update({
                    'XS': read_5_prime,
                    'XI': insert_start,
                    'XE': insert_end,
                    'XZ': self.genome.fetch(read.reference_name,
                                            insert_start,
                                            insert_end).upper()})
        except ValueError as exc:
            raise ValueError(f"Read {read.query_name}, "
                             f"insert region {read.reference_name}:{insert_start}-" # noqa
                             f"{insert_end} "
                             f"is out of bounds") \
                from exc

    # else, Read is in the forward orientation. Note that a single end
    # forward strand read with no other flags will have flag 0
    else:
        # For a forward read the relevant soft clip is the FIRST cigar
        # operation (see the reverse strand branch for details)
        try:
            first_op = read.cigartuples[0]
            soft_clip_length = first_op[1] if first_op[0] == 4 else 0
        except TypeError as exc:
            raise TypeError(f"Read {read.query_name}, "
                            f"cigar string {read.cigartuples} is not "
                            f"parse-able") \
                from exc
        # extract insert position
        read_5_prime = read.reference_start - soft_clip_length
        # the `insert_length` bases which precede the read (after
        # adjusting for soft clipping)
        insert_start = read_5_prime - self.insert_length

        try:
            # the soft clip adjustment moved the 5 prime end off the
            # beginning of the chrom
            if read_5_prime < 0:
                tag_dict.update(no_site)
            # the insertion sequence extends beyond the beginning of the
            # chrom
            elif insert_start < 0:
                tag_dict.update({**no_site, 'XS': read_5_prime})
            else:
                tag_dict.update({
                    'XS': read_5_prime,
                    'XI': insert_start,
                    'XE': read_5_prime,
                    'XZ': self.genome.fetch(read.reference_name,
                                            insert_start,
                                            read_5_prime).upper()})
        except ValueError as exc:
            raise ValueError(f"Read {read.query_name}, "
                             f"insert region "
                             f"{read.reference_name}:{insert_start}-" # noqa
                             f"{read_5_prime} is out of bounds") from exc

    # Set tags ------------------------------------------------------------
    for tag, tag_value in tag_dict.items():
        read.set_tag(tag, tag_value)

    return {'read': read, 'barcode_details': barcode_dict}