lyimo committed on
Commit
a8a77be
·
verified ·
1 Parent(s): 129decb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +246 -174
app.py CHANGED
@@ -1,213 +1,285 @@
1
- # app.py
2
  import streamlit as st
3
  from Bio import pairwise2
4
  import re
5
  from collections import defaultdict
6
  import pandas as pd
7
  import plotly.express as px
8
- import io
9
 
10
- def read_fasta_from_upload(uploaded_file):
11
- """
12
- Read a FASTA file from Streamlit upload
13
- """
14
- try:
15
- content = uploaded_file.getvalue().decode('utf-8').strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  parts = content.split('\n', 1)
17
  sequence = ''.join(parts[1].split('\n')).replace(' ', '')
18
  return sequence.upper()
19
- except Exception as e:
20
- st.error(f"Error reading uploaded file: {str(e)}")
21
- return None
 
 
 
 
22
 
23
  def extract_gene_region(genome_seq, gene_start, gene_end):
24
- """
25
- Extract a gene region with additional context
26
- """
27
- try:
28
- flank = 200
29
- start = max(0, gene_start - flank)
30
- end = min(len(genome_seq), gene_end + flank)
31
- return genome_seq[start:end], start
32
- except Exception as e:
33
- st.error(f"Error extracting gene region: {str(e)}")
34
- return None, None
35
 
36
  def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
37
- """
38
- Find mutations with sequence context
39
- """
40
- try:
41
- alignments = pairwise2.align.globalms(ref_seq, query_seq,
42
- match=2,
43
- mismatch=-3,
44
- open=-10,
45
- extend=-0.5)
46
-
47
- if not alignments:
48
- st.warning("No alignments found")
49
- return []
50
-
51
- alignment = alignments[0]
52
- ref_aligned, query_aligned = alignment[0], alignment[1]
53
-
54
- mutations = []
55
- real_pos = 0
56
-
57
- for i in range(len(ref_aligned)):
58
- if ref_aligned[i] != '-':
59
- real_pos += 1
60
-
61
- if ref_aligned[i] != query_aligned[i]:
62
- adj_pos = offset + real_pos
63
- if gene_start <= adj_pos <= gene_end:
64
- mut = {
65
- 'position': adj_pos,
66
- 'gene_position': adj_pos - gene_start + 1,
67
- 'ref_base': ref_aligned[i],
68
- 'query_base': query_aligned[i] if query_aligned[i] != '-' else 'None',
69
- 'type': 'SNP' if ref_aligned[i] != '-' and query_aligned[i] != '-' else 'INDEL',
70
- 'codon_position': (real_pos - 1) % 3 + 1,
71
- 'context': {
72
- 'ref': ref_aligned[max(0,i-5):i] + '[' + ref_aligned[i] + ']' + ref_aligned[i+1:i+6],
73
- 'query': query_aligned[max(0,i-5):i] + '[' + query_aligned[i] + ']' + query_aligned[i+1:i+6]
74
- }
75
- }
76
- mutations.append(mut)
77
-
78
- return mutations
79
- except Exception as e:
80
- st.error(f"Error in mutation analysis: {str(e)}")
81
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
- # Dictionary of important M. tuberculosis genes and their positions
84
- IMPORTANT_GENES = {
85
- 'rpoB': {'start': 759807, 'end': 763325, 'description': 'RNA polymerase β subunit (Rifampicin resistance)'},
86
- 'katG': {'start': 2153889, 'end': 2156111, 'description': 'Catalase-peroxidase (Isoniazid resistance)'},
87
- 'inhA': {'start': 1674202, 'end': 1675011, 'description': 'Enoyl-ACP reductase (Isoniazid resistance)'},
88
- 'gyrA': {'start': 7302, 'end': 9818, 'description': 'DNA gyrase subunit A (Fluoroquinolone resistance)'}
89
- }
90
-
91
- def create_mutation_dataframe(mutations):
92
- """
93
- Convert mutations list to pandas DataFrame
94
- """
95
- if not mutations:
96
- return pd.DataFrame()
97
 
98
- data = []
99
  for mut in mutations:
100
- data.append({
101
- 'Position': mut['position'],
102
- 'Gene Position': mut['gene_position'],
103
- 'Type': mut['type'],
104
- 'Reference': mut['ref_base'],
105
- 'Query': mut['query_base'],
106
- 'Codon Position': mut['codon_position']
107
- })
108
- return pd.DataFrame(data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- def plot_mutation_distribution(df):
111
- """
112
- Create a visualization of mutation distribution
113
- """
114
- if df.empty:
115
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
- fig = px.scatter(df,
118
- x='Position',
119
- y='Type',
120
- color='Type',
121
- title='Mutation Distribution',
122
- labels={'Position': 'Genome Position', 'Type': 'Mutation Type'})
123
  return fig
124
 
125
  def main():
126
- st.title("M. tuberculosis Genome Comparison Tool")
127
 
128
  st.markdown("""
129
- This tool compares two M. tuberculosis genomes and identifies mutations in important genes.
130
- Upload your reference genome (typically H37Rv) and your query genome (wild type/clinical isolate) in FASTA format.
 
131
  """)
132
 
133
- # File upload section
134
- col1, col2 = st.columns(2)
135
- with col1:
136
- reference_file = st.file_uploader("Upload Reference Genome (FASTA)", type=['fasta', 'fa'])
137
- with col2:
138
- query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
 
139
 
140
- # Gene selection
141
- selected_gene = st.selectbox(
142
- "Select gene to analyze",
143
- options=list(IMPORTANT_GENES.keys()),
144
- format_func=lambda x: f"{x} - {IMPORTANT_GENES[x]['description']}"
145
- )
146
 
147
- if reference_file and query_file:
148
- if st.button("Analyze Genomes"):
149
- with st.spinner("Analyzing genomes..."):
150
- # Read sequences
151
- ref_genome = read_fasta_from_upload(reference_file)
152
  query_genome = read_fasta_from_upload(query_file)
153
 
154
- if ref_genome and query_genome:
155
- # Get gene coordinates
156
- gene_start = IMPORTANT_GENES[selected_gene]['start']
157
- gene_end = IMPORTANT_GENES[selected_gene]['end']
 
 
158
 
159
- # Extract and analyze gene regions
160
- ref_region, ref_start = extract_gene_region(ref_genome, gene_start, gene_end)
161
- query_region, _ = extract_gene_region(query_genome, gene_start, gene_end)
162
 
163
- if ref_region and query_region:
164
- # Find mutations
165
- mutations = find_mutations_with_context(
166
- ref_region, query_region,
167
- gene_start, gene_end,
168
- ref_start
169
- )
170
-
171
- # Create results section
172
- st.subheader("Analysis Results")
173
-
174
- # Summary statistics
175
- st.markdown("### Summary Statistics")
176
- total_mutations = len(mutations)
177
- snps = len([m for m in mutations if m['type'] == 'SNP'])
178
- indels = len([m for m in mutations if m['type'] == 'INDEL'])
179
-
180
- col1, col2, col3 = st.columns(3)
181
- col1.metric("Total Mutations", total_mutations)
182
- col2.metric("SNPs", snps)
183
- col3.metric("INDELs", indels)
 
184
 
185
- # Convert mutations to DataFrame
186
- df = create_mutation_dataframe(mutations)
 
187
 
188
- if not df.empty:
189
- # Plot mutation distribution
190
- st.plotly_chart(plot_mutation_distribution(df))
191
-
192
- # Detailed mutation table
193
- st.markdown("### Detailed Mutation Analysis")
194
- st.dataframe(df)
195
-
196
- # Download results
197
- csv = df.to_csv(index=False)
198
- st.download_button(
199
- "Download Results CSV",
200
- csv,
201
- "mutations.csv",
202
- "text/csv",
203
- key='download-csv'
204
- )
205
- else:
206
- st.info(f"No mutations found in {selected_gene}")
207
- else:
208
- st.error("Error extracting gene regions")
 
 
 
 
209
  else:
210
- st.error("Error reading genome files")
 
211
 
212
  if __name__ == "__main__":
213
  main()
 
 
1
  import streamlit as st
2
  from Bio import pairwise2
3
  import re
4
  from collections import defaultdict
5
  import pandas as pd
6
  import plotly.express as px
7
+ import plotly.graph_objects as go
8
 
# Resistance-associated genes on the H37Rv reference (NC_000962.3).
#
# For each gene:
#   start / end  - genome coordinates of the gene (start < end).
#   strand       - '+' when the gene is read from start to end, '-' when it is
#                  read from end to start on the reverse strand (katG).
#   description  - gene product.
#   drug         - drug(s) whose resistance is associated with the gene.
#   mutations    - known substitutions keyed by codon number (as a string),
#                  with the wild-type residue ('from'), the resistant
#                  residues ('to') and frequency / confidence labels.
#
# Codon numbers follow M. tuberculosis numbering, i.e. they are counted from
# the gene's own start codon. For rpoB this differs by 81 from the historical
# E. coli numbering: S450L / H445 / D435 / L430 here are the mutations often
# quoted as S531L / H526 / D516 / L511.
RESISTANCE_GENES = {
    'rpoB': {
        'start': 759807,
        'end': 763325,
        'strand': '+',
        'description': 'RNA polymerase β subunit',
        'drug': 'Rifampicin',
        'mutations': {
            '450': {'from': 'S', 'to': ['L'], 'freq': 'High', 'confidence': 'High'},
            '445': {'from': 'H', 'to': ['Y', 'D', 'R'], 'freq': 'High', 'confidence': 'High'},
            '435': {'from': 'D', 'to': ['V', 'G'], 'freq': 'Moderate', 'confidence': 'High'},
            '430': {'from': 'L', 'to': ['P'], 'freq': 'Low', 'confidence': 'Moderate'}
        }
    },
    'katG': {
        'start': 2153889,
        'end': 2156111,
        'strand': '-',
        'description': 'Catalase-peroxidase',
        'drug': 'Isoniazid',
        'mutations': {
            '315': {'from': 'S', 'to': ['T', 'N'], 'freq': 'High', 'confidence': 'High'},
            '463': {'from': 'R', 'to': ['L'], 'freq': 'Moderate', 'confidence': 'Moderate'}
        }
    },
    'inhA': {
        'start': 1674202,
        'end': 1675011,
        'strand': '+',
        'description': 'Enoyl-ACP reductase',
        'drug': 'Isoniazid/Ethionamide',
        'mutations': {
            # NOTE(review): '-15' is a nucleotide position upstream of the
            # coding sequence, not a codon number, so a codon-based lookup
            # restricted to start..end cannot reach it - confirm how promoter
            # variants are meant to be detected.
            '-15': {'from': 'C', 'to': ['T'], 'freq': 'High', 'confidence': 'High'},
            '94': {'from': 'S', 'to': ['A'], 'freq': 'Moderate', 'confidence': 'High'}
        }
    },
    'gyrA': {
        'start': 7302,
        'end': 9818,
        'strand': '+',
        'description': 'DNA gyrase subunit A',
        'drug': 'Fluoroquinolones',
        'mutations': {
            '90': {'from': 'A', 'to': ['V'], 'freq': 'High', 'confidence': 'High'},
            '94': {'from': 'D', 'to': ['G', 'A', 'N'], 'freq': 'High', 'confidence': 'High'}
        }
    }
}
54
+
def read_fasta_file(file_path):
    """Read the first record of a FASTA file from disk.

    Lines starting with '>' are headers. Only the first record is returned,
    so header text of later records can never leak into the sequence. A file
    without any header line is read as a single bare sequence.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        The sequence of the first record as one upper-case string with all
        whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file holds no sequence data (for example only a
            header line), or is not valid UTF-8.
    """
    # Explicit encoding so parsing does not depend on the platform locale.
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read()

    chunks = []
    header_seen = False
    for raw_line in content.splitlines():
        line = ''.join(raw_line.split())  # drops spaces, tabs and stray '\r'
        if not line:
            continue
        if line.startswith('>'):
            if header_seen or chunks:
                break  # start of the next record
            header_seen = True
            continue
        chunks.append(line)

    if not chunks:
        raise ValueError(f"No sequence data found in FASTA file: {file_path}")
    return ''.join(chunks).upper()
62
+
def read_fasta_from_upload(uploaded_file):
    """Read the first record of an uploaded FASTA file.

    Lines starting with '>' are headers. Only the first record is returned,
    so header text of later records can never leak into the sequence. An
    upload without any header line is read as a single bare sequence.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the raw file
            content as bytes (as used here for Streamlit uploads).

    Returns:
        The sequence of the first record as one upper-case string with all
        whitespace removed.

    Raises:
        ValueError: If the upload holds no sequence data (for example only a
            header line), or is not valid UTF-8.
    """
    # 'utf-8-sig' also accepts plain UTF-8 and strips a leading BOM, which
    # would otherwise hide the '>' of the header line.
    content = uploaded_file.getvalue().decode('utf-8-sig')

    chunks = []
    header_seen = False
    for raw_line in content.splitlines():  # handles '\n', '\r\n' and '\r'
        line = ''.join(raw_line.split())
        if not line:
            continue
        if line.startswith('>'):
            if header_seen or chunks:
                break  # start of the next record
            header_seen = True
            continue
        chunks.append(line)

    if not chunks:
        raise ValueError("No sequence data found in uploaded FASTA file")
    return ''.join(chunks).upper()
69
 
def extract_gene_region(genome_seq, gene_start, gene_end, flank=200):
    """Extract a gene region together with flanking context.

    Args:
        genome_seq: Full genome sequence.
        gene_start: Start coordinate of the gene.
        gene_end: End coordinate of the gene.
        flank: Number of extra bases requested on each side of the gene.
            Defaults to 200, the value that used to be hard-coded.

    Returns:
        A ``(region, start)`` tuple where ``region`` is
        ``genome_seq[start:end]`` and ``start`` is the 0-based index of the
        region's first base in ``genome_seq``. Both ends are clamped to the
        sequence, so the region may be shorter than requested, or empty when
        the genome does not reach ``gene_start - flank``.

    Raises:
        ValueError: If ``flank`` is negative.
    """
    if flank < 0:
        raise ValueError(f"flank must be non-negative, got {flank}")

    start = max(0, gene_start - flank)
    end = min(len(genome_seq), gene_end + flank)
    return genome_seq[start:end], start
 
 
 
 
 
 
76
 
def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
    """Find differences between two sequences inside a gene, with context.

    The sequences are globally aligned (match 2, mismatch -3, gap open -10,
    gap extend -0.5) and every alignment column where they differ is
    reported if its genome position lies within ``gene_start..gene_end``.

    Args:
        ref_seq: Reference sequence of the region.
        query_seq: Query sequence of the region.
        gene_start: Genome coordinate of the first gene base (inclusive).
        gene_end: Genome coordinate of the last gene base (inclusive).
        offset: Value added to the 1-based position inside ``ref_seq`` to
            obtain the genome coordinate.

    Returns:
        A list of dicts, one per differing column, with the keys
        ``position`` (genome coordinate), ``gene_position`` (1-based,
        relative to ``gene_start``), ``ref_base``, ``query_base`` (the
        string ``'None'`` when the query has a gap), ``type`` (``'SNP'`` or
        ``'INDEL'``), ``codon_position`` (1-3, counted from ``gene_start``
        in reference orientation) and ``context`` (the aligned reference and
        query around the column, with the column itself in brackets).
        Empty when either sequence is empty or no alignment is produced.
    """
    # Nothing to align: an empty region means the genome does not cover it.
    if not ref_seq or not query_seq:
        return []

    # Only the first alignment is used, so do not enumerate every
    # co-optimal alignment.
    alignments = pairwise2.align.globalms(ref_seq, query_seq,
                                          match=2,
                                          mismatch=-3,
                                          open=-10,
                                          extend=-0.5,
                                          one_alignment_only=True)

    if not alignments:
        return []

    alignment = alignments[0]
    ref_aligned, query_aligned = alignment[0], alignment[1]

    mutations = []
    real_pos = 0  # number of reference bases consumed so far (1-based position)

    for i, (ref_base, query_base) in enumerate(zip(ref_aligned, query_aligned)):
        # Gaps in the reference do not advance the reference coordinate, so
        # an insertion is reported at the preceding reference base.
        if ref_base != '-':
            real_pos += 1

        if ref_base == query_base:
            continue

        adj_pos = offset + real_pos
        if not gene_start <= adj_pos <= gene_end:
            continue

        gene_position = adj_pos - gene_start + 1
        window_start = max(0, i - 5)
        mutations.append({
            'position': adj_pos,
            'gene_position': gene_position,
            'ref_base': ref_base,
            'query_base': query_base if query_base != '-' else 'None',
            'type': 'SNP' if ref_base != '-' and query_base != '-' else 'INDEL',
            # Frame is relative to the gene start, not to the start of the
            # flanked region that was aligned.
            'codon_position': (gene_position - 1) % 3 + 1,
            'context': {
                'ref': ref_aligned[window_start:i] + '[' + ref_base + ']' + ref_aligned[i+1:i+6],
                'query': query_aligned[window_start:i] + '[' + query_base + ']' + query_aligned[i+1:i+6]
            }
        })

    return mutations
116
 
# Standard genetic code, codons enumerated in T, C, A, G order.
_CODON_TABLE = dict(zip(
    (a + b + c for a in 'TCAG' for b in 'TCAG' for c in 'TCAG'),
    'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
))
_COMPLEMENT = str.maketrans('ACGT', 'TGCA')


def _codon_from_context(context, left_needed, right_needed):
    """Return the 3-base window around the bracketed base, or None if unusable."""
    left, _, remainder = context.partition('[')
    center, _, right = remainder.partition(']')
    if len(left) < left_needed or len(right) < right_needed:
        return None
    triplet = (left[len(left) - left_needed:] + center + right[:right_needed]).upper()
    # Gaps or ambiguous bases inside the codon cannot be translated.
    if len(triplet) != 3 or any(base not in 'ACGT' for base in triplet):
        return None
    return triplet


def analyze_resistance(mutations, gene_info):
    """Match substitutions against a gene's known resistance mutations.

    For every SNP the affected codon is rebuilt from the mutation's aligned
    ``context`` for both reference and query, translated with the standard
    genetic code and compared with the amino-acid patterns in
    ``gene_info['mutations']``.

    Args:
        mutations: Mutation dicts carrying ``gene_position`` (1-based
            position from the gene's ``start``), ``type`` and ``context``
            (``{'ref': 'xxxxx[b]xxxxx', 'query': ...}``). Entries whose type
            is not ``'SNP'``, or whose codon cannot be rebuilt, are skipped.
        gene_info: Gene description with ``start``, ``end``, ``mutations``
            (codon number as string -> ``from`` / ``to`` / ``freq`` /
            ``confidence``) and optionally ``strand`` (``'-'`` for genes
            read from ``end`` to ``start``; defaults to ``'+'``).

    Returns:
        A list with one dict per matching codon, holding ``position`` (codon
        number as string), ``change`` (for example ``'S315T'``),
        ``frequency`` and ``confidence``.
    """
    known = gene_info['mutations']
    minus_strand = gene_info.get('strand', '+') == '-'

    resistance_found = []
    reported = set()  # several SNPs in one codon describe one substitution

    for mut in mutations:
        if mut.get('type', 'SNP') != 'SNP':
            continue

        coding_position = mut['gene_position']
        if minus_strand:
            gene_length = gene_info['end'] - gene_info['start'] + 1
            coding_position = gene_length - coding_position + 1

        # gene_position is 1-based: bases 1-3 form codon 1.
        codon_index, frame = divmod(coding_position - 1, 3)
        codon_pos = str(codon_index + 1)
        if codon_pos not in known or codon_pos in reported:
            continue
        pattern = known[codon_pos]

        # Earlier coding bases sit to the left on '+' and to the right on '-'.
        if minus_strand:
            left_needed, right_needed = 2 - frame, frame
        else:
            left_needed, right_needed = frame, 2 - frame

        context = mut.get('context') or {}
        ref_codon = _codon_from_context(context.get('ref', ''), left_needed, right_needed)
        query_codon = _codon_from_context(context.get('query', ''), left_needed, right_needed)
        if ref_codon is None or query_codon is None:
            continue

        if minus_strand:
            ref_codon = ref_codon.translate(_COMPLEMENT)[::-1]
            query_codon = query_codon.translate(_COMPLEMENT)[::-1]

        ref_residue = _CODON_TABLE[ref_codon]
        query_residue = _CODON_TABLE[query_codon]
        if ref_residue == pattern['from'] and query_residue in pattern['to']:
            resistance_found.append({
                'position': codon_pos,
                'change': f"{pattern['from']}{codon_pos}{query_residue}",
                'frequency': pattern['freq'],
                'confidence': pattern['confidence']
            })
            reported.add(codon_pos)

    return resistance_found
134
+
def create_resistance_report(all_results):
    """Create a per-gene summary of the resistance mutations that were found.

    Args:
        all_results: Mapping of gene name to a dict whose ``'resistance'``
            entry is the list of matches for that gene; each match carries a
            ``'confidence'`` label.

    Returns:
        A list with one dict per gene that has at least one match, holding
        ``gene``, ``drug`` (from ``RESISTANCE_GENES``), ``mutations_found``,
        ``mutations`` and ``confidence`` (the strongest label among the
        gene's matches).
    """
    # Labels are ranked explicitly: comparing the strings themselves would
    # put 'Moderate' above 'High'. Unknown labels rank lowest.
    confidence_rank = {'Low': 0, 'Moderate': 1, 'High': 2}

    report = []
    for gene, results in all_results.items():
        mutations = results['resistance']
        if not mutations:
            continue
        confidence = max(
            (m['confidence'] for m in mutations),
            key=lambda level: confidence_rank.get(level, -1),
        )
        report.append({
            'gene': gene,
            'drug': RESISTANCE_GENES[gene]['drug'],
            'mutations_found': len(mutations),
            'mutations': mutations,
            'confidence': confidence
        })
    return report
151
 
def plot_gene_mutations(mutations_by_gene, genome_length):
    """Create a figure showing each resistance gene and its mutations.

    Every gene in ``RESISTANCE_GENES`` is drawn as a thick horizontal line
    from its start to its end; its mutations are drawn as star markers
    slightly above that line.

    Args:
        mutations_by_gene: Mapping of gene name to either a list of mutation
            dicts (each with a ``'position'``) or a result dict that holds
            such a list under ``'mutations'``. Genes that are missing are
            drawn without markers.
        genome_length: Accepted for interface compatibility; not used when
            building the figure.

    Returns:
        The ``plotly.graph_objects.Figure``.
    """
    fig = go.Figure()

    colors = {'rpoB': 'red', 'katG': 'blue', 'inhA': 'green', 'gyrA': 'purple'}

    for gene, gene_info in RESISTANCE_GENES.items():
        color = colors.get(gene, 'gray')

        # Callers may pass the full per-gene result dict rather than the bare
        # mutation list; iterating such a dict would yield its string keys.
        gene_result = mutations_by_gene.get(gene, [])
        if isinstance(gene_result, dict):
            mutations = gene_result.get('mutations', [])
        else:
            mutations = gene_result

        # Add gene region
        fig.add_trace(go.Scatter(
            x=[gene_info['start'], gene_info['end']],
            y=[1, 1],
            mode='lines',
            name=f"{gene} ({gene_info['drug']})",
            line=dict(color=color, width=20, dash='solid'),
        ))

        # Add mutations
        if mutations:
            x_pos = [m['position'] for m in mutations]
            fig.add_trace(go.Scatter(
                x=x_pos,
                y=[1.2] * len(x_pos),
                mode='markers',
                name=f'{gene} mutations',
                marker=dict(color=color, size=10, symbol='star'),
            ))

    fig.update_layout(
        title="Resistance-associated Mutations",
        xaxis_title="Genome Position",
        yaxis_visible=False,
        showlegend=True,
        height=400,
        margin=dict(l=50, r=50, t=50, b=50)
    )

    return fig
192
 
def main():
    """Run the Streamlit app.

    Loads the H37Rv reference from ``NC_000962.3.fasta``, lets the user
    upload a query genome and, on request, compares every gene in
    ``RESISTANCE_GENES`` between the two, then shows the resistance report,
    a mutation plot and a CSV download.
    """
    st.title("M. tuberculosis Drug Resistance Analysis")

    st.markdown("""
    ### Automated Drug Resistance Analysis Tool
    Upload your query genome (clinical isolate) in FASTA format for comparison with H37Rv reference.
    The tool will automatically analyze resistance-associated genes and provide a detailed report.
    """)

    # Load reference genome
    try:
        ref_genome = read_fasta_file("NC_000962.3.fasta")
        st.success("Reference genome (H37Rv) loaded successfully")
    except (OSError, ValueError, IndexError) as e:
        st.error(f"Error loading reference genome: {e}")
        return

    # Query genome upload
    query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])

    if not query_file:
        return
    if not st.button("Analyze Drug Resistance"):
        return

    with st.spinner("Analyzing genome..."):
        # A malformed upload must produce a message, not a traceback.
        try:
            query_genome = read_fasta_from_upload(query_file)
        except (ValueError, IndexError) as e:
            st.error(f"Error reading query genome: {e}")
            return

        # Analyze each resistance gene
        all_results = {}
        for gene, info in RESISTANCE_GENES.items():
            # Extract and analyze regions
            ref_region, ref_start = extract_gene_region(ref_genome, info['start'], info['end'])
            query_region, _ = extract_gene_region(query_genome, info['start'], info['end'])

            # An empty region means the genome does not reach these
            # coordinates; there is nothing to align.
            if not ref_region or not query_region:
                st.warning(f"{gene}: region {info['start']}-{info['end']} is not covered by the genome; skipped")
                all_results[gene] = {'mutations': [], 'resistance': []}
                continue

            # Find mutations
            mutations = find_mutations_with_context(ref_region, query_region, info['start'], info['end'], ref_start)

            # Analyze resistance patterns
            resistance = analyze_resistance(mutations, info)

            all_results[gene] = {
                'mutations': mutations,
                'resistance': resistance
            }

        # Generate comprehensive report
        resistance_report = create_resistance_report(all_results)

        # Display Results
        st.header("Drug Resistance Analysis Results")

        if resistance_report:
            st.warning("⚠️ Potential drug resistance mutations detected")

            # Display resistance summary
            for entry in resistance_report:
                st.subheader(f"🧬 {entry['gene']} - {RESISTANCE_GENES[entry['gene']]['drug']}")
                st.write(f"Confidence: {entry['confidence']}")
                st.write(f"Mutations found: {entry['mutations_found']}")

                # Create detailed mutation table
                mutations_df = pd.DataFrame(entry['mutations'])
                st.dataframe(mutations_df)

                st.markdown("---")

            # Visualize mutations. The plot expects gene -> list of mutation
            # dicts, not the full per-gene result dicts.
            st.subheader("Mutation Visualization")
            mutations_by_gene = {gene: result['mutations'] for gene, result in all_results.items()}
            fig = plot_gene_mutations(mutations_by_gene, len(ref_genome))
            st.plotly_chart(fig)

            # Clinical interpretation
            st.subheader("Clinical Interpretation")
            st.markdown("""
            - High confidence mutations strongly indicate resistance
            - Multiple mutations in the same gene may indicate high-level resistance
            - Consider phenotypic testing to confirm resistance patterns
            """)

            # Download results
            report_df = pd.DataFrame(resistance_report)
            csv = report_df.to_csv(index=False)
            st.download_button(
                "Download Detailed Report (CSV)",
                csv,
                "resistance_analysis.csv",
                "text/csv",
                key='download-csv'
            )
        else:
            st.success("No known resistance mutations detected")
            st.info("Note: This does not guarantee drug susceptibility. Consider phenotypic testing.")


if __name__ == "__main__":
    main()