File size: 18,433 Bytes
129decb
 
edf285e
129decb
 
 
 
a8a77be
129decb
edf285e
 
 
a8a77be
 
 
 
 
 
 
edf285e
a8a77be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edf285e
a8a77be
edf285e
a8a77be
 
 
 
 
 
 
 
 
 
 
 
 
 
edf285e
 
 
a8a77be
 
38d4316
 
 
 
 
 
 
 
 
a8a77be
 
 
38d4316
 
 
 
 
 
 
 
129decb
edf285e
 
 
129decb
edf285e
38d4316
 
 
 
 
edf285e
38d4316
 
 
 
129decb
edf285e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129decb
edf285e
 
 
 
 
38d4316
edf285e
 
38d4316
 
 
edf285e
38d4316
edf285e
38d4316
 
edf285e
 
 
 
 
 
 
 
38d4316
edf285e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38d4316
edf285e
 
 
 
38d4316
 
edf285e
 
 
 
 
 
 
 
 
129decb
a8a77be
edf285e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8a77be
 
 
edf285e
 
 
129decb
edf285e
129decb
 
a8a77be
 
edf285e
 
 
129decb
 
38d4316
 
 
a8a77be
38d4316
 
 
 
 
a8a77be
129decb
a8a77be
129decb
edf285e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38d4316
edf285e
 
 
129decb
edf285e
 
 
38d4316
edf285e
 
 
 
 
 
 
129decb
edf285e
 
129decb
edf285e
 
 
 
a8a77be
edf285e
 
 
 
 
38d4316
edf285e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a8a77be
edf285e
a8a77be
edf285e
 
 
 
a8a77be
edf285e
 
 
 
 
 
 
 
 
 
 
38d4316
edf285e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38d4316
edf285e
 
 
 
 
 
 
 
129decb
edf285e
129decb
edf285e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
import streamlit as st
from Bio import pairwise2
from Bio.Seq import Seq
import re
from collections import defaultdict
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# -------------------------------------------------
# 1. Define important gene regions and their associated resistance patterns
# -------------------------------------------------
# Resistance-gene catalogue consulted by the analysis functions in this file.
# Each entry is keyed by gene name and holds:
#   'start' / 'end'  - coordinates of the gene in the reference genome; main()
#                      passes them to extract_gene_region() and
#                      find_mutations_with_context().
#   'description'    - free-text label (not read by the code visible in this file).
#   'drug'           - drug name shown next to the gene in the UI and the CSV report.
#   'mutations'      - known patterns keyed by position, written as a string.
#                      analyze_resistance() parses the key with int(): values > 0
#                      are matched against codon numbers (amino acid 'from' ->
#                      one of 'to'); any other value is matched against a
#                      nucleotide difference at genome position start + key
#                      (base 'from' -> one of 'to'). 'freq' and 'confidence'
#                      are copied unchanged into every reported hit.
# NOTE(review): coordinates look like 1-based inclusive positions on the
#   reference that main() loads ("NC_000962.3.fasta") - confirm.
# NOTE(review): there is no strand field and the codon analysis never
#   reverse-complements; a gene annotated on the reverse strand (katG may be
#   one - verify) would be translated incorrectly.
RESISTANCE_GENES = {
    'rpoB': {
        'start': 759807,
        'end': 763325,
        'description': 'RNA polymerase β subunit',
        'drug': 'Rifampicin',
        'mutations': {
            # Example: codon 531, amino acid S -> L.
            # NOTE(review): 531/526/516/511 may follow E. coli codon numbering rather
            #   than codon numbers counted from 'start' - verify before relying on matches.
            '531': {'from': 'S', 'to': ['L'], 'freq': 'High', 'confidence': 'High'},
            '526': {'from': 'H', 'to': ['Y', 'D', 'R'], 'freq': 'High', 'confidence': 'High'},
            '516': {'from': 'D', 'to': ['V', 'G'], 'freq': 'Moderate', 'confidence': 'High'},
            '511': {'from': 'L', 'to': ['P'], 'freq': 'Low', 'confidence': 'Moderate'}
        }
    },
    'katG': {
        'start': 2153889,
        'end': 2156111,
        'description': 'Catalase-peroxidase',
        'drug': 'Isoniazid',
        'mutations': {
            '315': {'from': 'S', 'to': ['T', 'N'], 'freq': 'High', 'confidence': 'High'},
            '463': {'from': 'R', 'to': ['L'], 'freq': 'Moderate', 'confidence': 'Moderate'}
        }
    },
    'inhA': {
        'start': 1674202,
        'end': 1675011,
        'description': 'Enoyl-ACP reductase',
        'drug': 'Isoniazid/Ethionamide',
        'mutations': {
            # Non-positive keys are nucleotide positions relative to 'start' (promoter /
            # regulatory sites); 'from'/'to' are bases here, not amino acids.
            # NOTE(review): the -15 promoter mutation may be defined relative to an
            #   upstream gene's start rather than inhA's 'start' - verify the coordinate.
            '-15': {'from': 'C', 'to': ['T'], 'freq': 'High', 'confidence': 'High'},
            '94':  {'from': 'S', 'to': ['A'], 'freq': 'Moderate', 'confidence': 'High'}
        }
    },
    'gyrA': {
        'start': 7302,
        'end': 9818,
        'description': 'DNA gyrase subunit A',
        'drug': 'Fluoroquinolones',
        'mutations': {
            '90': {'from': 'A', 'to': ['V'], 'freq': 'High', 'confidence': 'High'},
            '94': {'from': 'D', 'to': ['G', 'A', 'N'], 'freq': 'High', 'confidence': 'High'}
        }
    }
}

# -------------------------------------------------
# 2. File reading functions
# -------------------------------------------------
def read_fasta_file(file_path):
    """Read the first record of a FASTA file from disk.

    The first line is treated as the header and discarded. The following
    lines are concatenated with every whitespace character removed, and
    reading stops at the next ``>`` header so the text of a later record
    never ends up inside the returned sequence.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        The upper-cased sequence string, or ``None`` (after reporting the
        problem through ``st.error``) when the file cannot be read or holds
        no sequence data.
    """
    try:
        with open(file_path, 'r') as handle:
            lines = handle.read().strip().splitlines()

        residues = []
        for line in lines[1:]:
            if line.startswith('>'):
                # Start of a second record: only the first one is analysed.
                break
            # split()/join drops spaces, tabs and stray carriage returns alike.
            residues.append(''.join(line.split()))

        sequence = ''.join(residues)
        if not sequence:
            raise ValueError("no sequence data found after the header line")
        return sequence.upper()
    except Exception as e:
        # UI boundary: surface the problem to the user and signal failure with None.
        st.error(f"Error reading file {file_path}: {str(e)}")
        return None

def read_fasta_from_upload(uploaded_file):
    """Read the first record of a FASTA file received through a Streamlit upload.

    The upload is decoded as UTF-8. The first line is treated as the header
    and discarded; the following lines are concatenated with every whitespace
    character removed. Lines are split with ``splitlines()`` so Windows
    (CRLF) uploads do not leave carriage returns inside the sequence, and
    reading stops at the next ``>`` header so later records are not mixed in.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the raw
            file bytes (as Streamlit's uploaded-file object does).

    Returns:
        The upper-cased sequence string, or ``None`` (after reporting the
        problem through ``st.error``) when the upload cannot be decoded or
        holds no sequence data.
    """
    try:
        lines = uploaded_file.getvalue().decode('utf-8').strip().splitlines()

        residues = []
        for line in lines[1:]:
            if line.startswith('>'):
                # Start of a second record: only the first one is analysed.
                break
            # split()/join drops spaces, tabs and stray carriage returns alike.
            residues.append(''.join(line.split()))

        sequence = ''.join(residues)
        if not sequence:
            raise ValueError("no sequence data found after the header line")
        return sequence.upper()
    except Exception as e:
        # UI boundary: surface the problem to the user and signal failure with None.
        st.error(f"Error reading uploaded file: {str(e)}")
        return None

# -------------------------------------------------
# 3. Region extraction function
# -------------------------------------------------
def extract_gene_region(genome_seq, gene_start, gene_end):
    """Slice a gene out of a genome together with flanking sequence.

    The slice runs from ``gene_start - 200`` to ``gene_end + 200`` (used
    directly as Python slice bounds) and is clamped to the ends of
    ``genome_seq``. The length of the extracted window is reported through
    ``st.write``.

    Args:
        genome_seq: Full genome sequence (anything supporting ``len`` and slicing).
        gene_start: Start coordinate of the gene.
        gene_end: End coordinate of the gene.

    Returns:
        ``(window, window_start)`` where ``window_start`` is the slice index
        at which the window begins, or ``(None, None)`` after reporting the
        problem through ``st.error`` if anything goes wrong.
    """
    padding = 200
    try:
        # Lower bound: never reach before the first base of the genome.
        window_start = gene_start - padding
        if window_start < 0:
            window_start = 0

        # Upper bound: never reach past the last base of the genome.
        genome_length = len(genome_seq)
        window_end = gene_end + padding
        if window_end > genome_length:
            window_end = genome_length

        window = genome_seq[window_start:window_end]
        st.write(f"Extracted sequence length: {len(window)}bp (for region {gene_start}-{gene_end})")
    except Exception as e:
        st.error(f"Error extracting gene region: {str(e)}")
        return None, None
    return window, window_start

# -------------------------------------------------
# 4. Codon-level extraction from aligned sequences
# -------------------------------------------------
def extract_codon_alignment(ref_aligned, query_aligned, gene_start, gene_end, offset):
    """
    Convert a nucleotide alignment into a list of codon-level amino-acid differences.

    Coordinates follow the convention used by find_mutations_with_context():
    the n-th non-gap reference base of the alignment sits at genome position
    ``offset + n``, and ``gene_start`` / ``gene_end`` are inclusive positions
    on that same scale.

    Codons are counted from ``gene_start`` (codon 1 = positions gene_start ..
    gene_start + 2), so the reading frame is anchored to the gene itself and
    not to wherever the flanking sequence happens to begin. Alignment columns
    with a gap in the reference (insertions in the query) are skipped because
    they have no reference coordinate; a gap in the query is replaced by 'N'
    before translation. Only codons lying completely inside the gene are
    translated. No reverse-complementing is done, so the result is meaningful
    only when the supplied sequences are in the gene's coding orientation.

    Args:
        ref_aligned: Reference row of the alignment (may contain '-').
        query_aligned: Query row of the alignment (may contain '-').
        gene_start: Genome position of the first base of the gene.
        gene_end: Genome position of the last base of the gene.
        offset: Value added to the running count of reference bases to obtain
            a genome position.

    Returns:
        A list of dicts with keys 'codon_number', 'ref_aa' and 'query_aa',
        one per codon whose translation differs between reference and query.
    """
    codon_list = []
    ref_codon = []
    query_codon = []
    genome_pos = offset  # genome position of the last reference base consumed

    for ref_base, query_base in zip(ref_aligned, query_aligned):
        if ref_base == '-':
            # Insertion in the query: no reference coordinate to advance.
            continue

        genome_pos += 1
        if genome_pos < gene_start:
            # Still in the upstream flank.
            continue
        if genome_pos > gene_end:
            # Past the gene; nothing downstream can belong to a codon.
            break

        # Only open a new codon on a frame boundary relative to gene_start.
        # This keeps the frame correct even if the alignment starts mid-gene.
        if not ref_codon and (genome_pos - gene_start) % 3 != 0:
            continue

        ref_codon.append(ref_base)
        query_codon.append('N' if query_base == '-' else query_base)
        if len(ref_codon) < 3:
            continue

        ref_aa = str(Seq(''.join(ref_codon)).translate())
        query_aa = str(Seq(''.join(query_codon)).translate())
        if ref_aa != query_aa:
            # genome_pos is the third base of the codon just completed.
            codon_list.append({
                'codon_number': (genome_pos - gene_start) // 3 + 1,
                'ref_aa': ref_aa,
                'query_aa': query_aa
            })

        # Reset for the next codon
        ref_codon = []
        query_codon = []

    return codon_list

# -------------------------------------------------
# 5. Find both codon-level and promoter-level mutations
# -------------------------------------------------
def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
    """
    Globally align a reference region with a query region and collect the differences.

    Steps:
    1) Align the two nucleotide sequences with pairwise2 (global alignment,
       match 2, mismatch -3, gap open -10, gap extend -0.5) and keep the first
       alignment returned.
    2) Hand the aligned rows to extract_codon_alignment() for amino-acid changes.
    3) Record every substitution column (both rows non-gap and different) with
       its genome position, computed as ``offset`` plus the running count of
       non-gap reference bases. Substitutions are recorded whether they fall
       inside or outside gene_start..gene_end; columns with a gap in either
       row are not recorded here.

    Returns a dict with 'codon_diffs' and 'nt_diffs' lists. When no alignment
    is produced (a warning is shown) or any exception occurs (an error is
    shown), both lists are empty.
    """
    try:
        candidates = pairwise2.align.globalms(
            ref_seq, query_seq, match=2, mismatch=-3, open=-10, extend=-0.5
        )
        if not candidates:
            st.warning("No alignments found")
            return {'codon_diffs': [], 'nt_diffs': []}

        # Work with the first alignment in the returned list.
        top_hit = candidates[0]
        aligned_ref = top_hit[0]
        aligned_query = top_hit[1]

        # Amino-acid level changes.
        aa_changes = extract_codon_alignment(aligned_ref, aligned_query, gene_start, gene_end, offset)

        # Nucleotide level substitutions, positioned on the reference.
        base_changes = []
        ref_bases_seen = 0
        for column, ref_nt in enumerate(aligned_ref):
            query_nt = aligned_query[column]
            if ref_nt == '-':
                # Insertion in the query: the reference position does not advance.
                continue
            ref_bases_seen += 1
            position = offset + ref_bases_seen
            if query_nt == '-' or query_nt == ref_nt:
                # Deletion in the query or identical base: not a substitution.
                continue
            base_changes.append({
                'genome_pos': position,
                'ref_base': ref_nt,
                'query_base': query_nt
            })

        return {'codon_diffs': aa_changes, 'nt_diffs': base_changes}
    except Exception as e:
        st.error(f"Error in mutation analysis: {str(e)}")
        return {'codon_diffs': [], 'nt_diffs': []}

# -------------------------------------------------
# 6. Analyze the found mutations for known resistance patterns
# -------------------------------------------------
def analyze_resistance(mutation_data, gene_info):
    """Match observed differences against a gene's catalogue of known resistance patterns.

    Every key of ``gene_info['mutations']`` is parsed as an integer (keys that
    are not integers are ignored). A positive key names a codon: entries of
    ``mutation_data['codon_diffs']`` with that 'codon_number' are compared on
    'ref_aa' / 'query_aa'. Any other key names a nucleotide position relative
    to the gene start: entries of ``mutation_data['nt_diffs']`` whose
    'genome_pos' equals ``gene_info['start'] + key`` are compared on
    'ref_base' / 'query_base'. A difference is a hit when its reference value
    equals the pattern's 'from' and its query value is listed in 'to'.

    Returns a list of dicts with keys 'position', 'change', 'frequency' and
    'confidence', in catalogue order.
    """
    aa_changes = mutation_data['codon_diffs']
    base_changes = mutation_data['nt_diffs']

    hits = []
    for label, rule in gene_info['mutations'].items():
        try:
            position = int(label)
        except ValueError:
            # Not a numeric position; nothing to look up.
            continue

        if position > 0:
            # Coding change: look the codon up among the amino-acid differences.
            candidates = [d for d in aa_changes if d['codon_number'] == position]
            ref_key, alt_key = 'ref_aa', 'query_aa'
        else:
            # Upstream/promoter change: translate the relative position to a genome one.
            target = gene_info['start'] + position
            candidates = [d for d in base_changes if d['genome_pos'] == target]
            ref_key, alt_key = 'ref_base', 'query_base'

        for observed in candidates:
            if observed[ref_key] != rule['from']:
                continue
            if observed[alt_key] not in rule['to']:
                continue
            hits.append({
                'position': label,
                'change': f"{rule['from']}{label}{observed[alt_key]}",
                'frequency': rule['freq'],
                'confidence': rule['confidence']
            })

    return hits

# -------------------------------------------------
# 7. Main Streamlit App
# -------------------------------------------------
def _build_report_rows(all_results):
    """Flatten per-gene analysis results into rows for the CSV report.

    Args:
        all_results: Mapping of gene name to a dict with 'mutation_data'
            (holding 'codon_diffs' and 'nt_diffs' lists) and 'resistance'
            (list of matched resistance patterns), as assembled by main().

    Returns:
        A list of dicts. Each row carries 'Gene', 'Drug' and 'Type'
        ('Codon_diff', 'Nucleotide_diff' or 'Resistance') followed by the
        fields of the underlying record. Rows are grouped per gene in that
        type order.
    """
    rows = []
    for gene, results in all_results.items():
        drug = RESISTANCE_GENES[gene]['drug']
        sections = (
            ('Codon_diff', results['mutation_data']['codon_diffs']),
            ('Nucleotide_diff', results['mutation_data']['nt_diffs']),
            ('Resistance', results['resistance']),
        )
        for row_type, records in sections:
            rows.extend(
                {'Gene': gene, 'Drug': drug, 'Type': row_type, **record}
                for record in records
            )
    return rows


def main():
    """Run the Streamlit app: load genomes, analyse each gene, show and export results.

    Flow:
    1) Load the reference genome from "NC_000962.3.fasta"; stop if that fails.
    2) Wait for a query FASTA upload and a click on "Analyze Drug Resistance".
    3) For every entry of RESISTANCE_GENES, extract the gene region from both
       genomes, align them, and match the differences against known patterns.
    4) Show per-gene results and offer the complete report as a CSV download.

    The CSV download button is rendered directly with the results. It must not
    sit behind a second ``st.button``: a click reruns the script, the
    "Analyze Drug Resistance" button then reads as not pressed, and anything
    nested under it would never be reached.
    """
    st.title("M. tuberculosis Drug Resistance Analysis - FIXED VERSION")

    st.markdown("""
    ### Automated Drug Resistance Analysis Tool
    Upload your query genome (clinical isolate) in FASTA format for comparison with H37Rv reference.

    **Note**: This version correctly checks *codon-based* amino-acid mutations (e.g., rpoB S531L)
    and *nucleotide-based* promoter mutations (e.g., inhA -15C>T).
    """)

    # Debug mode toggle
    debug_mode = st.checkbox("Enable debug mode")

    # Load reference genome
    ref_genome = read_fasta_file("NC_000962.3.fasta")
    if not ref_genome:
        st.error("Failed to load reference genome")
        return
    st.success(f"Reference genome loaded successfully (length: {len(ref_genome)}bp)")

    query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])

    # The analyse button is only shown once a file has been uploaded.
    if not (query_file and st.button("Analyze Drug Resistance")):
        return

    query_genome = read_fasta_from_upload(query_file)
    if not query_genome:
        return
    st.success(f"Query genome loaded successfully (length: {len(query_genome)}bp)")

    # Analysis progress tracking
    progress_bar = st.progress(0)
    status_text = st.empty()

    # Results per gene: {'mutation_data': ..., 'resistance': ...}
    all_results = {}

    for i, (gene, info) in enumerate(RESISTANCE_GENES.items()):
        status_text.text(f"Analyzing {gene} ({info['drug']})...")
        progress_bar.progress((i + 1) / len(RESISTANCE_GENES))

        if debug_mode:
            st.subheader(f"Analyzing {gene}")
            st.write(f"Gene region: {info['start']}-{info['end']}")

        # Both genomes are sliced with the reference coordinates; only the
        # reference window start is needed to map positions back to the genome.
        ref_region, ref_start = extract_gene_region(ref_genome, info['start'], info['end'])
        query_region, _ = extract_gene_region(query_genome, info['start'], info['end'])

        if not (ref_region and query_region):
            st.error(f"Failed to analyze {gene}")
            continue

        # Find mutations (codon-level + any promoter-level)
        mutation_data = find_mutations_with_context(
            ref_region, query_region,
            info['start'], info['end'],
            ref_start
        )

        # Match them against the known resistance patterns
        resistance = analyze_resistance(mutation_data, info)

        all_results[gene] = {
            'mutation_data': mutation_data,
            'resistance': resistance
        }

        if debug_mode:
            st.write(f"Codon-level differences: {len(mutation_data['codon_diffs'])}")
            st.write(mutation_data['codon_diffs'])
            st.write(f"Nucleotide-level differences: {len(mutation_data['nt_diffs'])}")
            st.write(mutation_data['nt_diffs'])

            st.write(f"Identified {len(resistance)} resistance patterns")

    # Clear progress indicators
    progress_bar.empty()
    status_text.empty()

    # Display Results
    st.header("Analysis Results")

    for gene, results in all_results.items():
        st.subheader(f"{gene} Analysis")
        info = RESISTANCE_GENES[gene]

        st.write(f"Drug: {info['drug']}")

        num_codon_diffs = len(results['mutation_data']['codon_diffs'])
        num_nt_diffs = len(results['mutation_data']['nt_diffs'])
        st.write(f"Total codon-level differences found: {num_codon_diffs}")
        st.write(f"Total nucleotide-level differences found: {num_nt_diffs}")

        if results['resistance']:
            st.warning(f"Potential resistance mutations found in {gene}")
            resistance_df = pd.DataFrame(results['resistance'])
            st.dataframe(resistance_df)
        else:
            st.info(f"No known resistance mutations found in {gene}")

    # Offer the complete report right away (see the docstring for why this is
    # not hidden behind another button).
    report_df = pd.DataFrame(_build_report_rows(all_results))
    st.download_button(
        "Download Full Report (CSV)",
        report_df.to_csv(index=False),
        "mtb_analysis_report_fixed.csv",
        "text/csv"
    )

# Entry point: start the app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()