lyimo committed on
Commit
38d4316
·
verified ·
1 Parent(s): bb2e975

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -165
app.py CHANGED
@@ -54,71 +54,98 @@ RESISTANCE_GENES = {
54
 
55
  def read_fasta_file(file_path):
56
  """Read a FASTA file from disk"""
57
- with open(file_path, 'r') as handle:
58
- content = handle.read().strip()
59
- parts = content.split('\n', 1)
60
- sequence = ''.join(parts[1].split('\n')).replace(' ', '')
61
- return sequence.upper()
 
 
 
 
62
 
63
  def read_fasta_from_upload(uploaded_file):
64
  """Read a FASTA file from Streamlit upload"""
65
- content = uploaded_file.getvalue().decode('utf-8').strip()
66
- parts = content.split('\n', 1)
67
- sequence = ''.join(parts[1].split('\n')).replace(' ', '')
68
- return sequence.upper()
 
 
 
 
69
 
70
  def extract_gene_region(genome_seq, gene_start, gene_end):
71
  """Extract a gene region with additional context"""
72
- flank = 200
73
- start = max(0, gene_start - flank)
74
- end = min(len(genome_seq), gene_end + flank)
75
- return genome_seq[start:end], start
 
 
 
 
 
 
76
 
77
  def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
78
  """Find mutations with sequence context"""
79
- alignments = pairwise2.align.globalms(ref_seq, query_seq,
80
- match=2,
81
- mismatch=-3,
82
- open=-10,
83
- extend=-0.5)
84
-
85
- if not alignments:
86
- return []
87
-
88
- alignment = alignments[0]
89
- ref_aligned, query_aligned = alignment[0], alignment[1]
90
-
91
- mutations = []
92
- real_pos = 0
93
-
94
- for i in range(len(ref_aligned)):
95
- if ref_aligned[i] != '-':
96
- real_pos += 1
97
-
98
- if ref_aligned[i] != query_aligned[i]:
99
- adj_pos = offset + real_pos
100
- if gene_start <= adj_pos <= gene_end:
101
- mut = {
102
- 'position': adj_pos,
103
- 'gene_position': adj_pos - gene_start + 1,
104
- 'ref_base': ref_aligned[i],
105
- 'query_base': query_aligned[i] if query_aligned[i] != '-' else 'None',
106
- 'type': 'SNP' if ref_aligned[i] != '-' and query_aligned[i] != '-' else 'INDEL',
107
- 'codon_position': (real_pos - 1) % 3 + 1,
108
- 'context': {
109
- 'ref': ref_aligned[max(0,i-5):i] + '[' + ref_aligned[i] + ']' + ref_aligned[i+1:i+6],
110
- 'query': query_aligned[max(0,i-5):i] + '[' + query_aligned[i] + ']' + query_aligned[i+1:i+6]
 
 
 
 
 
 
 
111
  }
112
- }
113
- mutations.append(mut)
114
-
115
- return mutations
 
 
 
116
 
117
  def analyze_resistance(mutations, gene_info):
118
  """Analyze mutations for drug resistance patterns"""
119
  resistance_found = []
120
 
 
 
121
  for mut in mutations:
 
122
  codon_pos = str(mut['gene_position'] // 3 + 1)
123
  if codon_pos in gene_info['mutations']:
124
  pattern = gene_info['mutations'][codon_pos]
@@ -132,154 +159,122 @@ def analyze_resistance(mutations, gene_info):
132
 
133
  return resistance_found
134
 
135
- def create_resistance_report(all_results):
136
- """Create a comprehensive resistance report"""
137
- report = []
138
- for gene, results in all_results.items():
139
- if results['resistance']:
140
- drug = RESISTANCE_GENES[gene]['drug']
141
- mutations = results['resistance']
142
- confidence = max(m['confidence'] for m in mutations)
143
- report.append({
144
- 'gene': gene,
145
- 'drug': drug,
146
- 'mutations_found': len(mutations),
147
- 'mutations': mutations,
148
- 'confidence': confidence
149
- })
150
- return report
151
-
152
- def plot_gene_mutations(mutations_by_gene, genome_length):
153
- """Create a visualization of mutations across genes"""
154
- fig = go.Figure()
155
-
156
- colors = {'rpoB': 'red', 'katG': 'blue', 'inhA': 'green', 'gyrA': 'purple'}
157
-
158
- for gene in RESISTANCE_GENES:
159
- gene_info = RESISTANCE_GENES[gene]
160
- mutations = mutations_by_gene.get(gene, [])
161
-
162
- # Add gene region
163
- fig.add_trace(go.Scatter(
164
- x=[gene_info['start'], gene_info['end']],
165
- y=[1, 1],
166
- mode='lines',
167
- name=f"{gene} ({gene_info['drug']})",
168
- line=dict(color=colors.get(gene, 'gray'), width=20, dash='solid'),
169
- ))
170
-
171
- # Add mutations
172
- if mutations:
173
- x_pos = [m['position'] for m in mutations]
174
- fig.add_trace(go.Scatter(
175
- x=x_pos,
176
- y=[1.2] * len(x_pos),
177
- mode='markers',
178
- name=f'{gene} mutations',
179
- marker=dict(color=colors.get(gene, 'gray'), size=10, symbol='star'),
180
- ))
181
-
182
- fig.update_layout(
183
- title="Resistance-associated Mutations",
184
- xaxis_title="Genome Position",
185
- yaxis_visible=False,
186
- showlegend=True,
187
- height=400,
188
- margin=dict(l=50, r=50, t=50, b=50)
189
- )
190
-
191
- return fig
192
-
193
  def main():
194
  st.title("M. tuberculosis Drug Resistance Analysis")
195
 
196
  st.markdown("""
197
  ### Automated Drug Resistance Analysis Tool
198
  Upload your query genome (clinical isolate) in FASTA format for comparison with H37Rv reference.
199
- The tool will automatically analyze resistance-associated genes and provide a detailed report.
200
  """)
201
 
 
 
 
202
  # Load reference genome
203
- try:
204
- ref_genome = read_fasta_file("NC_000962.3.fasta")
205
- st.success("Reference genome (H37Rv) loaded successfully")
206
- except Exception as e:
207
- st.error(f"Error loading reference genome: {e}")
208
  return
209
 
210
- # Query genome upload
211
  query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
212
 
213
  if query_file:
214
  if st.button("Analyze Drug Resistance"):
215
- with st.spinner("Analyzing genome..."):
216
- query_genome = read_fasta_from_upload(query_file)
 
 
 
 
 
217
 
218
- # Analyze each resistance gene
219
  all_results = {}
220
- for gene, info in RESISTANCE_GENES.items():
221
- # Extract and analyze regions
222
- ref_region, ref_start = extract_gene_region(ref_genome, info['start'], info['end'])
223
- query_region, _ = extract_gene_region(query_genome, info['start'], info['end'])
 
224
 
225
- # Find mutations
226
- mutations = find_mutations_with_context(ref_region, query_region, info['start'], info['end'], ref_start)
 
227
 
228
- # Analyze resistance patterns
229
- resistance = analyze_resistance(mutations, info)
 
230
 
231
- all_results[gene] = {
232
- 'mutations': mutations,
233
- 'resistance': resistance
234
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
 
236
- # Generate comprehensive report
237
- resistance_report = create_resistance_report(all_results)
 
238
 
239
  # Display Results
240
- st.header("Drug Resistance Analysis Results")
241
 
242
- if resistance_report:
243
- st.warning("⚠️ Potential drug resistance mutations detected")
 
 
244
 
245
- # Display resistance summary
246
- for entry in resistance_report:
247
- st.subheader(f"🧬 {entry['gene']} - {RESISTANCE_GENES[entry['gene']]['drug']}")
248
- st.write(f"Confidence: {entry['confidence']}")
249
- st.write(f"Mutations found: {entry['mutations_found']}")
250
-
251
- # Create detailed mutation table
252
- mutations_df = pd.DataFrame(entry['mutations'])
253
- st.dataframe(mutations_df)
254
-
255
- st.markdown("---")
256
 
257
- # Visualize mutations
258
- st.subheader("Mutation Visualization")
259
- fig = plot_gene_mutations(all_results, len(ref_genome))
260
- st.plotly_chart(fig)
261
 
262
- # Clinical interpretation
263
- st.subheader("Clinical Interpretation")
264
- st.markdown("""
265
- - High confidence mutations strongly indicate resistance
266
- - Multiple mutations in the same gene may indicate high-level resistance
267
- - Consider phenotypic testing to confirm resistance patterns
268
- """)
 
 
 
 
 
 
 
 
 
 
 
269
 
270
- # Download results
271
- report_df = pd.DataFrame(resistance_report)
272
  csv = report_df.to_csv(index=False)
273
  st.download_button(
274
- "Download Detailed Report (CSV)",
275
  csv,
276
- "resistance_analysis.csv",
277
- "text/csv",
278
- key='download-csv'
279
  )
280
- else:
281
- st.success("No known resistance mutations detected")
282
- st.info("Note: This does not guarantee drug susceptibility. Consider phenotypic testing.")
283
 
284
  if __name__ == "__main__":
285
  main()
 
54
 
55
  def read_fasta_file(file_path):
56
  """Read a FASTA file from disk"""
57
+ try:
58
+ with open(file_path, 'r') as handle:
59
+ content = handle.read().strip()
60
+ parts = content.split('\n', 1)
61
+ sequence = ''.join(parts[1].split('\n')).replace(' ', '')
62
+ return sequence.upper()
63
+ except Exception as e:
64
+ st.error(f"Error reading file {file_path}: {str(e)}")
65
+ return None
66
 
67
  def read_fasta_from_upload(uploaded_file):
68
  """Read a FASTA file from Streamlit upload"""
69
+ try:
70
+ content = uploaded_file.getvalue().decode('utf-8').strip()
71
+ parts = content.split('\n', 1)
72
+ sequence = ''.join(parts[1].split('\n')).replace(' ', '')
73
+ return sequence.upper()
74
+ except Exception as e:
75
+ st.error(f"Error reading uploaded file: {str(e)}")
76
+ return None
77
 
78
  def extract_gene_region(genome_seq, gene_start, gene_end):
79
  """Extract a gene region with additional context"""
80
+ try:
81
+ flank = 200
82
+ start = max(0, gene_start - flank)
83
+ end = min(len(genome_seq), gene_end + flank)
84
+ extracted_seq = genome_seq[start:end]
85
+ st.write(f"Extracted sequence length: {len(extracted_seq)}bp")
86
+ return extracted_seq, start
87
+ except Exception as e:
88
+ st.error(f"Error extracting gene region: {str(e)}")
89
+ return None, None
90
 
91
  def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
92
  """Find mutations with sequence context"""
93
+ try:
94
+ st.write(f"Aligning sequences (lengths: ref={len(ref_seq)}, query={len(query_seq)})")
95
+
96
+ alignments = pairwise2.align.globalms(ref_seq, query_seq,
97
+ match=2,
98
+ mismatch=-3,
99
+ open=-10,
100
+ extend=-0.5)
101
+
102
+ if not alignments:
103
+ st.warning("No alignments found")
104
+ return []
105
+
106
+ alignment = alignments[0]
107
+ ref_aligned, query_aligned = alignment[0], alignment[1]
108
+
109
+ st.write(f"Alignment lengths: ref={len(ref_aligned)}, query={len(query_aligned)}")
110
+
111
+ mutations = []
112
+ real_pos = 0
113
+
114
+ for i in range(len(ref_aligned)):
115
+ if ref_aligned[i] != '-':
116
+ real_pos += 1
117
+
118
+ if ref_aligned[i] != query_aligned[i]:
119
+ adj_pos = offset + real_pos
120
+ if gene_start <= adj_pos <= gene_end:
121
+ mut = {
122
+ 'position': adj_pos,
123
+ 'gene_position': adj_pos - gene_start + 1,
124
+ 'ref_base': ref_aligned[i],
125
+ 'query_base': query_aligned[i] if query_aligned[i] != '-' else 'None',
126
+ 'type': 'SNP' if ref_aligned[i] != '-' and query_aligned[i] != '-' else 'INDEL',
127
+ 'codon_position': (real_pos - 1) % 3 + 1,
128
+ 'context': {
129
+ 'ref': ref_aligned[max(0,i-5):i] + '[' + ref_aligned[i] + ']' + ref_aligned[i+1:i+6],
130
+ 'query': query_aligned[max(0,i-5):i] + '[' + query_aligned[i] + ']' + query_aligned[i+1:i+6]
131
+ }
132
  }
133
+ mutations.append(mut)
134
+
135
+ st.write(f"Found {len(mutations)} mutations")
136
+ return mutations
137
+ except Exception as e:
138
+ st.error(f"Error in mutation analysis: {str(e)}")
139
+ return []
140
 
141
  def analyze_resistance(mutations, gene_info):
142
  """Analyze mutations for drug resistance patterns"""
143
  resistance_found = []
144
 
145
+ st.write(f"Analyzing {len(mutations)} mutations for resistance patterns")
146
+
147
  for mut in mutations:
148
+ st.write(f"Mutation at position {mut['position']}: {mut['ref_base']} -> {mut['query_base']}")
149
  codon_pos = str(mut['gene_position'] // 3 + 1)
150
  if codon_pos in gene_info['mutations']:
151
  pattern = gene_info['mutations'][codon_pos]
 
159
 
160
  return resistance_found
161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  def main():
163
  st.title("M. tuberculosis Drug Resistance Analysis")
164
 
165
  st.markdown("""
166
  ### Automated Drug Resistance Analysis Tool
167
  Upload your query genome (clinical isolate) in FASTA format for comparison with H37Rv reference.
 
168
  """)
169
 
170
+ # Debug mode toggle
171
+ debug_mode = st.checkbox("Enable debug mode")
172
+
173
  # Load reference genome
174
+ ref_genome = read_fasta_file("NC_000962.3.fasta")
175
+ if ref_genome:
176
+ st.success(f"Reference genome loaded successfully (length: {len(ref_genome)}bp)")
177
+ else:
178
+ st.error("Failed to load reference genome")
179
  return
180
 
 
181
  query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
182
 
183
  if query_file:
184
  if st.button("Analyze Drug Resistance"):
185
+ query_genome = read_fasta_from_upload(query_file)
186
+ if query_genome:
187
+ st.success(f"Query genome loaded successfully (length: {len(query_genome)}bp)")
188
+
189
+ # Analysis progress tracking
190
+ progress_bar = st.progress(0)
191
+ status_text = st.empty()
192
 
193
+ # Store all results
194
  all_results = {}
195
+
196
+ # Analyze each gene
197
+ for i, (gene, info) in enumerate(RESISTANCE_GENES.items()):
198
+ status_text.text(f"Analyzing {gene} ({info['drug']})...")
199
+ progress_bar.progress((i + 1) / len(RESISTANCE_GENES))
200
 
201
+ if debug_mode:
202
+ st.subheader(f"Analyzing {gene}")
203
+ st.write(f"Gene region: {info['start']}-{info['end']}")
204
 
205
+ # Extract regions
206
+ ref_region, ref_start = extract_gene_region(ref_genome, info['start'], info['end'])
207
+ query_region, _ = extract_gene_region(query_genome, info['start'], info['end'])
208
 
209
+ if ref_region and query_region:
210
+ # Find mutations
211
+ mutations = find_mutations_with_context(
212
+ ref_region, query_region,
213
+ info['start'], info['end'],
214
+ ref_start
215
+ )
216
+
217
+ # Analyze resistance
218
+ resistance = analyze_resistance(mutations, info)
219
+
220
+ all_results[gene] = {
221
+ 'mutations': mutations,
222
+ 'resistance': resistance
223
+ }
224
+
225
+ if debug_mode:
226
+ st.write(f"Found {len(mutations)} mutations")
227
+ st.write(f"Identified {len(resistance)} resistance patterns")
228
+ else:
229
+ st.error(f"Failed to analyze {gene}")
230
 
231
+ # Clear progress indicators
232
+ progress_bar.empty()
233
+ status_text.empty()
234
 
235
  # Display Results
236
+ st.header("Analysis Results")
237
 
238
+ # Show results for each gene
239
+ for gene, results in all_results.items():
240
+ st.subheader(f"{gene} Analysis")
241
+ info = RESISTANCE_GENES[gene]
242
 
243
+ st.write(f"Drug: {info['drug']}")
244
+ st.write(f"Total mutations found: {len(results['mutations'])}")
 
 
 
 
 
 
 
 
 
245
 
246
+ if results['mutations']:
247
+ mutations_df = pd.DataFrame(results['mutations'])
248
+ st.write("All mutations found:")
249
+ st.dataframe(mutations_df)
250
 
251
+ if results['resistance']:
252
+ st.warning(f"Potential resistance mutations found in {gene}")
253
+ resistance_df = pd.DataFrame(results['resistance'])
254
+ st.dataframe(resistance_df)
255
+ else:
256
+ st.info(f"No known resistance mutations found in {gene}")
257
+
258
+ # Download complete results
259
+ if st.button("Download Complete Analysis"):
260
+ # Create detailed report DataFrame
261
+ report_data = []
262
+ for gene, results in all_results.items():
263
+ for mut in results['mutations']:
264
+ report_data.append({
265
+ 'Gene': gene,
266
+ 'Drug': RESISTANCE_GENES[gene]['drug'],
267
+ **mut
268
+ })
269
 
270
+ report_df = pd.DataFrame(report_data)
 
271
  csv = report_df.to_csv(index=False)
272
  st.download_button(
273
+ "Download Full Report (CSV)",
274
  csv,
275
+ "mtb_analysis_report.csv",
276
+ "text/csv"
 
277
  )
 
 
 
278
 
279
  if __name__ == "__main__":
280
  main()