# File size: 10,064 Bytes
# 18ea056
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from Bio import pairwise2
from collections import defaultdict
import re

# Drug-resistance gene regions, keyed by gene name (positions based on H37Rv).
# Each entry carries a (start, end) coordinate pair under 'range' and a
# human-readable label under 'description'.
IMPORTANT_GENES = {
    'rpoB': {
        'range': (759807, 763325),
        'description': 'RNA polymerase β subunit (Rifampicin resistance)',
    },
    'katG': {
        'range': (2153889, 2156111),
        'description': 'Catalase-peroxidase (Isoniazid resistance)',
    },
    'inhA': {
        'range': (1674202, 1675011),
        'description': 'Enoyl-ACP reductase (Isoniazid resistance)',
    },
    'gyrA': {
        'range': (7302, 9818),
        'description': 'DNA gyrase subunit A (Fluoroquinolone resistance)',
    },
}

def read_fasta_from_upload(uploaded_file):
    """Read the nucleotide sequence from an uploaded FASTA file.

    The first line of the upload is always treated as the header and is
    discarded. The remaining lines are joined into one upper-case string
    with all whitespace removed. Any further header lines (starting with
    '>'), as found in multi-record FASTA files, are skipped so that header
    text never ends up inside the sequence.

    Args:
        uploaded_file: Object exposing ``getvalue()`` that returns the raw
            file content as UTF-8 encoded bytes (e.g. a Streamlit upload).

    Returns:
        The concatenated sequence as an upper-case string.

    Raises:
        ValueError: If the upload contains no sequence data after the header.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    text = uploaded_file.getvalue().decode('utf-8').strip()

    # splitlines() handles '\n', '\r\n' and '\r' alike, so files saved with
    # Windows line endings do not leave stray '\r' characters in the sequence.
    lines = text.splitlines()

    sequence = ''.join(
        ''.join(line.split())  # drop spaces, tabs and any other whitespace
        for line in lines[1:]  # lines[0] is the header
        if not line.lstrip().startswith('>')
    )

    if not sequence:
        raise ValueError("FASTA file contains no sequence data")

    return sequence.upper()

def split_genome_into_chunks(sequence, chunk_size=10000, overlap=100):
    """Split a genome into overlapping chunks for alignment.

    Chunks start every ``chunk_size - overlap`` characters, so consecutive
    chunks share ``overlap`` characters. Chunking stops as soon as a chunk
    reaches the end of the sequence; a further chunk would lie entirely
    inside the previous one and only duplicate its content.

    Args:
        sequence: The full sequence to split.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A ``(chunks, positions)`` tuple of two parallel lists: the chunk
        strings and the 0-based start offset of each chunk in ``sequence``.
        Both lists are empty for an empty sequence.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    # A step of zero would make range() fail and a negative step would
    # silently produce no chunks at all, so reject both up front.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be >= 0 and smaller than chunk_size")

    step = chunk_size - overlap
    total_length = len(sequence)

    chunks = []
    positions = []
    for start in range(0, total_length, step):
        chunks.append(sequence[start:start + chunk_size])
        positions.append(start)
        if start + chunk_size >= total_length:
            # This chunk already covers the tail of the sequence.
            break
    return chunks, positions

def find_mutations_in_chunk(ref_chunk, query_chunk, chunk_start):
    """Find differences between a reference chunk and a query chunk.

    The two chunks are globally aligned with ``pairwise2`` and every aligned
    column where the reference and query characters differ is reported.

    Args:
        ref_chunk: Reference sequence chunk.
        query_chunk: Query sequence chunk to compare against the reference.
        chunk_start: Offset of ``ref_chunk`` within the full reference; it is
            added to the in-chunk position to form the reported position.

    Returns:
        A list of dicts, one per differing column, with keys:
        'position' (``chunk_start`` plus the number of reference bases
        consumed so far, minus one), 'ref_base', 'query_base' (the string
        'None' when the query has a gap), 'type' ('SNP' when neither side is
        a gap, otherwise 'INDEL') and 'context' (up to five aligned
        characters on either side for both sequences). When the position
        falls inside a range from IMPORTANT_GENES the dict also carries
        'gene', 'gene_position' and 'gene_description'. The list is empty
        when no alignment is produced.
    """
    # Only the first alignment is used below, so ask pairwise2 to recover a
    # single one instead of enumerating every co-optimal alignment.
    alignments = pairwise2.align.globalms(ref_chunk, query_chunk,
                                          match=2,
                                          mismatch=-3,
                                          open=-10,
                                          extend=-0.5,
                                          one_alignment_only=True)

    mutations = []
    if not alignments:
        return mutations

    best = alignments[0]
    ref_aligned, query_aligned = best[0], best[1]

    # Number of non-gap reference characters seen so far. Columns that are
    # gaps in the reference (insertions) keep the count of the preceding base.
    ref_bases_seen = 0
    for col, (ref_base, query_base) in enumerate(zip(ref_aligned, query_aligned)):
        if ref_base != '-':
            ref_bases_seen += 1

        if ref_base == query_base:
            continue

        # NOTE(review): with a 0-based chunk_start this is a 0-based offset;
        # if the IMPORTANT_GENES ranges are 1-based annotation coordinates
        # the gene lookup below is off by one - confirm.
        abs_pos = chunk_start + ref_bases_seen - 1
        is_snp = ref_base != '-' and query_base != '-'
        mut = {
            'position': abs_pos,
            'ref_base': ref_base,
            'query_base': query_base if query_base != '-' else 'None',
            'type': 'SNP' if is_snp else 'INDEL',
            'context': {
                'ref': ref_aligned[max(0, col - 5):col] + '[' + ref_base + ']' + ref_aligned[col + 1:col + 6],
                'query': query_aligned[max(0, col - 5):col] + '[' + query_base + ']' + query_aligned[col + 1:col + 6]
            }
        }

        # Tag the mutation when it falls inside a resistance-associated gene
        # (range bounds are treated as inclusive).
        for gene, info in IMPORTANT_GENES.items():
            start, end = info['range']
            if start <= abs_pos <= end:
                mut['gene'] = gene
                mut['gene_position'] = abs_pos - start + 1
                mut['gene_description'] = info['description']

        mutations.append(mut)

    return mutations

def visualize_mutations(mutations, genome_length):
    """Build the genome-wide Plotly figure of gene regions and mutations.

    Each entry of IMPORTANT_GENES is drawn as a thick horizontal line at
    y=1 spanning its coordinate range. Mutations, if any, are drawn as
    markers at y=1.1, red for 'SNP' and blue for every other type.

    Args:
        mutations: List of mutation dicts with at least the keys 'position',
            'type', 'ref_base' and 'query_base'.
        genome_length: Accepted for interface compatibility; not used when
            building the figure.

    Returns:
        The assembled ``plotly.graph_objects.Figure``.
    """
    figure = go.Figure()

    # Gene track: one line trace per gene, all sharing the same height.
    gene_track_y = 1
    for gene_name, gene_info in IMPORTANT_GENES.items():
        gene_start, gene_end = gene_info['range']
        figure.add_trace(go.Scatter(
            x=[gene_start, gene_end],
            y=[gene_track_y, gene_track_y],
            mode='lines',
            name=gene_name,
            line=dict(width=10),
            hoverinfo='text',
            hovertext=f"{gene_name}: {gene_start}-{gene_end}"
        ))

    def hover_label(row):
        """Format the hover text for a single mutation row."""
        return (f"Position: {row['position']}<br>"
                f"Type: {row['type']}<br>"
                f"Change: {row['ref_base']}->{row['query_base']}")

    # Mutation track: skipped entirely when there is nothing to plot.
    frame = pd.DataFrame(mutations)
    if not frame.empty:
        marker_colors = ['red' if kind == 'SNP' else 'blue' for kind in frame['type']]
        figure.add_trace(go.Scatter(
            x=frame['position'],
            y=[1.1] * len(frame),
            mode='markers',
            name='Mutations',
            marker=dict(color=marker_colors, size=8),
            hoverinfo='text',
            hovertext=frame.apply(hover_label, axis=1)
        ))

    figure.update_layout(
        title="Genome-wide Mutation Distribution",
        xaxis_title="Genome Position",
        yaxis_visible=False,
        showlegend=True,
        height=400
    )

    return figure

def analyze_mutations(mutations):
    """Summarise a list of mutation dicts.

    Args:
        mutations: List of mutation dicts. Every dict must have a 'type'
            key; dicts that also have a 'gene' key count as important.

    Returns:
        A dict with the keys 'total_mutations', 'snps', 'indels',
        'by_gene' (a ``defaultdict(int)`` mapping gene name to mutation
        count) and 'important_mutations' (the gene-tagged mutation dicts,
        in input order).
    """
    snp_count = sum(1 for entry in mutations if entry['type'] == 'SNP')
    indel_count = sum(1 for entry in mutations if entry['type'] == 'INDEL')

    # Mutations that were tagged with a gene are the "important" ones.
    gene_tagged = [entry for entry in mutations if 'gene' in entry]

    per_gene = defaultdict(int)
    for entry in gene_tagged:
        per_gene[entry['gene']] += 1

    return {
        'total_mutations': len(mutations),
        'snps': snp_count,
        'indels': indel_count,
        'by_gene': per_gene,
        'important_mutations': gene_tagged,
    }

def main():
    """Run the Streamlit app: upload two genomes, compare them, show results.

    The reference and query FASTA uploads are read, split into chunks at
    identical offsets, and compared chunk by chunk. The collected mutations
    are summarised, plotted, and the gene-tagged ones are offered as a CSV
    download. Any exception raised during the analysis is shown to the user
    as an error message instead of crashing the app.
    """
    st.title("M. tuberculosis Full Genome Comparison")
    
    st.markdown("""
    This tool performs whole-genome comparison of M. tuberculosis strains, identifying mutations 
    and analyzing resistance-associated genes.
    
    **Instructions:**
    1. Upload your reference genome (typically H37Rv)
    2. Upload your query genome (clinical isolate)
    3. Configure analysis parameters if needed
    4. Run the analysis
    """)
    
    # File upload
    col1, col2 = st.columns(2)
    with col1:
        reference_file = st.file_uploader("Reference Genome (FASTA)", type=['fasta', 'fa'])
    with col2:
        query_file = st.file_uploader("Query Genome (FASTA)", type=['fasta', 'fa'])
    
    # Analysis parameters
    with st.expander("Advanced Settings"):
        chunk_size = st.slider("Analysis chunk size (bp)", 5000, 20000, 10000, 1000)
        overlap = st.slider("Chunk overlap (bp)", 50, 200, 100, 10)
    
    if reference_file and query_file:
        if st.button("Run Analysis"):
            with st.spinner("Analyzing genomes..."):
                try:
                    # Read sequences
                    ref_genome = read_fasta_from_upload(reference_file)
                    query_genome = read_fasta_from_upload(query_file)
                    
                    # Show progress
                    progress_bar = st.progress(0)
                    status = st.empty()
                    
                    # Split genomes
                    status.text("Splitting genomes into chunks...")
                    ref_chunks, chunk_positions = split_genome_into_chunks(ref_genome, chunk_size, overlap)
                    query_chunks, _ = split_genome_into_chunks(query_genome, chunk_size, overlap)
                    
                    # Process chunks
                    # NOTE(review): chunks are paired by identical offsets, so
                    # a large insertion or deletion shifts every downstream
                    # query chunk relative to its reference chunk.
                    status.text("Analyzing mutations...")
                    all_mutations = []
                    # zip() stops at the shorter list; size the progress bar
                    # by the pairs that are actually processed.
                    chunk_pairs = list(zip(ref_chunks, query_chunks, chunk_positions))
                    total_chunks = len(chunk_pairs)
                    
                    # Consecutive chunks overlap, so a mutation inside an
                    # overlap is found once per covering chunk. Remember what
                    # earlier chunks reported and keep each mutation once.
                    # Entries repeated within one chunk (e.g. a multi-base
                    # insertion at one reference position) are all kept.
                    already_reported = set()
                    
                    for i, (ref_chunk, query_chunk, chunk_start) in enumerate(chunk_pairs):
                        progress_bar.progress((i + 1) / total_chunks)
                        mutations = find_mutations_in_chunk(ref_chunk, query_chunk, chunk_start)
                        chunk_keys = set()
                        for mut in mutations:
                            key = (mut['position'], mut['type'], mut['ref_base'], mut['query_base'])
                            chunk_keys.add(key)
                            if key not in already_reported:
                                all_mutations.append(mut)
                        already_reported |= chunk_keys
                    
                    # Analysis complete
                    progress_bar.empty()
                    status.empty()
                    
                    # Generate results
                    stats = analyze_mutations(all_mutations)
                    
                    # Display results
                    st.success("Analysis complete!")
                    
                    # Summary statistics
                    st.header("Results Summary")
                    col1, col2, col3 = st.columns(3)
                    col1.metric("Total Mutations", stats['total_mutations'])
                    col2.metric("SNPs", stats['snps'])
                    col3.metric("INDELs", stats['indels'])
                    
                    # Genome-wide visualization
                    st.plotly_chart(visualize_mutations(all_mutations, len(ref_genome)))
                    
                    # Gene-specific results
                    st.header("Resistance-Associated Genes")
                    gene_mutations = pd.DataFrame([
                        {"Gene": gene, "Mutations": count, "Description": IMPORTANT_GENES[gene]['description']}
                        for gene, count in stats['by_gene'].items()
                    ])
                    
                    if not gene_mutations.empty:
                        st.dataframe(gene_mutations)
                    
                    # Detailed mutation table
                    if stats['important_mutations']:
                        st.header("Detailed Mutation Analysis")
                        mutations_df = pd.DataFrame(stats['important_mutations'])
                        st.dataframe(mutations_df)
                        
                        # Download option
                        csv = mutations_df.to_csv(index=False)
                        st.download_button(
                            "Download Results (CSV)",
                            csv,
                            "mtb_mutations.csv",
                            "text/csv",
                            key='download-csv'
                        )
                    
                except Exception as e:
                    # Top-level UI boundary: surface any failure to the user.
                    st.error(f"Analysis error: {str(e)}")

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()