Spaces:

lyimo
/

dnaseq

Sleeping

App Files Community

lyimo committed 26 days ago

Commit

38d4316

verified ·

1 Parent(s): bb2e975

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -165

app.py CHANGED Viewed

@@ -54,71 +54,98 @@ RESISTANCE_GENES = {
 def read_fasta_file(file_path):
     """Read a FASTA file from disk"""
-    with open(file_path, 'r') as handle:
-        content = handle.read().strip()
-        parts = content.split('\n', 1)
-        sequence = ''.join(parts[1].split('\n')).replace(' ', '')
-        return sequence.upper()
 def read_fasta_from_upload(uploaded_file):
     """Read a FASTA file from Streamlit upload"""
-    content = uploaded_file.getvalue().decode('utf-8').strip()
-    parts = content.split('\n', 1)
-    sequence = ''.join(parts[1].split('\n')).replace(' ', '')
-    return sequence.upper()
 def extract_gene_region(genome_seq, gene_start, gene_end):
     """Extract a gene region with additional context"""
-    flank = 200
-    start = max(0, gene_start - flank)
-    end = min(len(genome_seq), gene_end + flank)
-    return genome_seq[start:end], start
 def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
     """Find mutations with sequence context"""
-    alignments = pairwise2.align.globalms(ref_seq, query_seq,
-                                        match=2,
-                                        mismatch=-3,
-                                        open=-10,
-                                        extend=-0.5)
-    if not alignments:
-        return []
-    alignment = alignments[0]
-    ref_aligned, query_aligned = alignment[0], alignment[1]
-    mutations = []
-    real_pos = 0
-    for i in range(len(ref_aligned)):
-        if ref_aligned[i] != '-':
-            real_pos += 1
-        if ref_aligned[i] != query_aligned[i]:
-            adj_pos = offset + real_pos
-            if gene_start <= adj_pos <= gene_end:
-                mut = {
-                    'position': adj_pos,
-                    'gene_position': adj_pos - gene_start + 1,
-                    'ref_base': ref_aligned[i],
-                    'query_base': query_aligned[i] if query_aligned[i] != '-' else 'None',
-                    'type': 'SNP' if ref_aligned[i] != '-' and query_aligned[i] != '-' else 'INDEL',
-                    'codon_position': (real_pos - 1) % 3 + 1,
-                    'context': {
-                        'ref': ref_aligned[max(0,i-5):i] + '[' + ref_aligned[i] + ']' + ref_aligned[i+1:i+6],
-                        'query': query_aligned[max(0,i-5):i] + '[' + query_aligned[i] + ']' + query_aligned[i+1:i+6]
                     }
-                }
-                mutations.append(mut)
-    return mutations
 def analyze_resistance(mutations, gene_info):
     """Analyze mutations for drug resistance patterns"""
     resistance_found = []
     for mut in mutations:
         codon_pos = str(mut['gene_position'] // 3 + 1)
         if codon_pos in gene_info['mutations']:
             pattern = gene_info['mutations'][codon_pos]
@@ -132,154 +159,122 @@ def analyze_resistance(mutations, gene_info):
     return resistance_found
-def create_resistance_report(all_results):
-    """Create a comprehensive resistance report"""
-    report = []
-    for gene, results in all_results.items():
-        if results['resistance']:
-            drug = RESISTANCE_GENES[gene]['drug']
-            mutations = results['resistance']
-            confidence = max(m['confidence'] for m in mutations)
-            report.append({
-                'gene': gene,
-                'drug': drug,
-                'mutations_found': len(mutations),
-                'mutations': mutations,
-                'confidence': confidence
-            })
-    return report
-def plot_gene_mutations(mutations_by_gene, genome_length):
-    """Create a visualization of mutations across genes"""
-    fig = go.Figure()
-    colors = {'rpoB': 'red', 'katG': 'blue', 'inhA': 'green', 'gyrA': 'purple'}
-    for gene in RESISTANCE_GENES:
-        gene_info = RESISTANCE_GENES[gene]
-        mutations = mutations_by_gene.get(gene, [])
-        # Add gene region
-        fig.add_trace(go.Scatter(
-            x=[gene_info['start'], gene_info['end']],
-            y=[1, 1],
-            mode='lines',
-            name=f"{gene} ({gene_info['drug']})",
-            line=dict(color=colors.get(gene, 'gray'), width=20, dash='solid'),
-        ))
-        # Add mutations
-        if mutations:
-            x_pos = [m['position'] for m in mutations]
-            fig.add_trace(go.Scatter(
-                x=x_pos,
-                y=[1.2] * len(x_pos),
-                mode='markers',
-                name=f'{gene} mutations',
-                marker=dict(color=colors.get(gene, 'gray'), size=10, symbol='star'),
-            ))
-    fig.update_layout(
-        title="Resistance-associated Mutations",
-        xaxis_title="Genome Position",
-        yaxis_visible=False,
-        showlegend=True,
-        height=400,
-        margin=dict(l=50, r=50, t=50, b=50)
-    )
-    return fig
 def main():
     st.title("M. tuberculosis Drug Resistance Analysis")
     st.markdown("""
     ### Automated Drug Resistance Analysis Tool
     Upload your query genome (clinical isolate) in FASTA format for comparison with H37Rv reference.
-    The tool will automatically analyze resistance-associated genes and provide a detailed report.
     """)
     # Load reference genome
-    try:
-        ref_genome = read_fasta_file("NC_000962.3.fasta")
-        st.success("Reference genome (H37Rv) loaded successfully")
-    except Exception as e:
-        st.error(f"Error loading reference genome: {e}")
         return
-    # Query genome upload
     query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
     if query_file:
         if st.button("Analyze Drug Resistance"):
-            with st.spinner("Analyzing genome..."):
-                query_genome = read_fasta_from_upload(query_file)
-                # Analyze each resistance gene
                 all_results = {}
-                for gene, info in RESISTANCE_GENES.items():
-                    # Extract and analyze regions
-                    ref_region, ref_start = extract_gene_region(ref_genome, info['start'], info['end'])
-                    query_region, _ = extract_gene_region(query_genome, info['start'], info['end'])
-                    # Find mutations
-                    mutations = find_mutations_with_context(ref_region, query_region, info['start'], info['end'], ref_start)
-                    # Analyze resistance patterns
-                    resistance = analyze_resistance(mutations, info)
-                    all_results[gene] = {
-                        'mutations': mutations,
-                        'resistance': resistance
-                    }
-                # Generate comprehensive report
-                resistance_report = create_resistance_report(all_results)
                 # Display Results
-                st.header("Drug Resistance Analysis Results")
-                if resistance_report:
-                    st.warning("⚠️ Potential drug resistance mutations detected")
-                    # Display resistance summary
-                    for entry in resistance_report:
-                        st.subheader(f"🧬 {entry['gene']} - {RESISTANCE_GENES[entry['gene']]['drug']}")
-                        st.write(f"Confidence: {entry['confidence']}")
-                        st.write(f"Mutations found: {entry['mutations_found']}")
-                        # Create detailed mutation table
-                        mutations_df = pd.DataFrame(entry['mutations'])
-                        st.dataframe(mutations_df)
-                        st.markdown("---")
-                    # Visualize mutations
-                    st.subheader("Mutation Visualization")
-                    fig = plot_gene_mutations(all_results, len(ref_genome))
-                    st.plotly_chart(fig)
-                    # Clinical interpretation
-                    st.subheader("Clinical Interpretation")
-                    st.markdown("""
-                    - High confidence mutations strongly indicate resistance
-                    - Multiple mutations in the same gene may indicate high-level resistance
-                    - Consider phenotypic testing to confirm resistance patterns
-                    """)
-                    # Download results
-                    report_df = pd.DataFrame(resistance_report)
                     csv = report_df.to_csv(index=False)
                     st.download_button(
-                        "Download Detailed Report (CSV)",
                         csv,
-                        "resistance_analysis.csv",
-                        "text/csv",
-                        key='download-csv'
                     )
-                else:
-                    st.success("No known resistance mutations detected")
-                    st.info("Note: This does not guarantee drug susceptibility. Consider phenotypic testing.")
 if __name__ == "__main__":
     main()

 def read_fasta_file(file_path):
     """Read a FASTA file from disk"""
+    try:
+        with open(file_path, 'r') as handle:
+            content = handle.read().strip()
+            parts = content.split('\n', 1)
+            sequence = ''.join(parts[1].split('\n')).replace(' ', '')
+            return sequence.upper()
+    except Exception as e:
+        st.error(f"Error reading file {file_path}: {str(e)}")
+        return None
 def read_fasta_from_upload(uploaded_file):
     """Read a FASTA file from Streamlit upload"""
+    try:
+        content = uploaded_file.getvalue().decode('utf-8').strip()
+        parts = content.split('\n', 1)
+        sequence = ''.join(parts[1].split('\n')).replace(' ', '')
+        return sequence.upper()
+    except Exception as e:
+        st.error(f"Error reading uploaded file: {str(e)}")
+        return None
 def extract_gene_region(genome_seq, gene_start, gene_end):
     """Extract a gene region with additional context"""
+    try:
+        flank = 200
+        start = max(0, gene_start - flank)
+        end = min(len(genome_seq), gene_end + flank)
+        extracted_seq = genome_seq[start:end]
+        st.write(f"Extracted sequence length: {len(extracted_seq)}bp")
+        return extracted_seq, start
+    except Exception as e:
+        st.error(f"Error extracting gene region: {str(e)}")
+        return None, None
 def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
     """Find mutations with sequence context"""
+    try:
+        st.write(f"Aligning sequences (lengths: ref={len(ref_seq)}, query={len(query_seq)})")
+        alignments = pairwise2.align.globalms(ref_seq, query_seq,
+                                            match=2,
+                                            mismatch=-3,
+                                            open=-10,
+                                            extend=-0.5)
+        if not alignments:
+            st.warning("No alignments found")
+            return []
+        alignment = alignments[0]
+        ref_aligned, query_aligned = alignment[0], alignment[1]
+        st.write(f"Alignment lengths: ref={len(ref_aligned)}, query={len(query_aligned)}")
+        mutations = []
+        real_pos = 0
+        for i in range(len(ref_aligned)):
+            if ref_aligned[i] != '-':
+                real_pos += 1
+            if ref_aligned[i] != query_aligned[i]:
+                adj_pos = offset + real_pos
+                if gene_start <= adj_pos <= gene_end:
+                    mut = {
+                        'position': adj_pos,
+                        'gene_position': adj_pos - gene_start + 1,
+                        'ref_base': ref_aligned[i],
+                        'query_base': query_aligned[i] if query_aligned[i] != '-' else 'None',
+                        'type': 'SNP' if ref_aligned[i] != '-' and query_aligned[i] != '-' else 'INDEL',
+                        'codon_position': (real_pos - 1) % 3 + 1,
+                        'context': {
+                            'ref': ref_aligned[max(0,i-5):i] + '[' + ref_aligned[i] + ']' + ref_aligned[i+1:i+6],
+                            'query': query_aligned[max(0,i-5):i] + '[' + query_aligned[i] + ']' + query_aligned[i+1:i+6]
+                        }
                     }
+                    mutations.append(mut)
+        st.write(f"Found {len(mutations)} mutations")
+        return mutations
+    except Exception as e:
+        st.error(f"Error in mutation analysis: {str(e)}")
+        return []
 def analyze_resistance(mutations, gene_info):
     """Analyze mutations for drug resistance patterns"""
     resistance_found = []
+    st.write(f"Analyzing {len(mutations)} mutations for resistance patterns")
     for mut in mutations:
+        st.write(f"Mutation at position {mut['position']}: {mut['ref_base']} -> {mut['query_base']}")
         codon_pos = str(mut['gene_position'] // 3 + 1)
         if codon_pos in gene_info['mutations']:
             pattern = gene_info['mutations'][codon_pos]
     return resistance_found
 def main():
     st.title("M. tuberculosis Drug Resistance Analysis")
     st.markdown("""
     ### Automated Drug Resistance Analysis Tool
     Upload your query genome (clinical isolate) in FASTA format for comparison with H37Rv reference.
     """)
+    # Debug mode toggle
+    debug_mode = st.checkbox("Enable debug mode")
     # Load reference genome
+    ref_genome = read_fasta_file("NC_000962.3.fasta")
+    if ref_genome:
+        st.success(f"Reference genome loaded successfully (length: {len(ref_genome)}bp)")
+    else:
+        st.error("Failed to load reference genome")
         return
     query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
     if query_file:
         if st.button("Analyze Drug Resistance"):
+            query_genome = read_fasta_from_upload(query_file)
+            if query_genome:
+                st.success(f"Query genome loaded successfully (length: {len(query_genome)}bp)")
+                # Analysis progress tracking
+                progress_bar = st.progress(0)
+                status_text = st.empty()
+                # Store all results
                 all_results = {}
+                # Analyze each gene
+                for i, (gene, info) in enumerate(RESISTANCE_GENES.items()):
+                    status_text.text(f"Analyzing {gene} ({info['drug']})...")
+                    progress_bar.progress((i + 1) / len(RESISTANCE_GENES))
+                    if debug_mode:
+                        st.subheader(f"Analyzing {gene}")
+                        st.write(f"Gene region: {info['start']}-{info['end']}")
+                    # Extract regions
+                    ref_region, ref_start = extract_gene_region(ref_genome, info['start'], info['end'])
+                    query_region, _ = extract_gene_region(query_genome, info['start'], info['end'])
+                    if ref_region and query_region:
+                        # Find mutations
+                        mutations = find_mutations_with_context(
+                            ref_region, query_region,
+                            info['start'], info['end'],
+                            ref_start
+                        )
+                        # Analyze resistance
+                        resistance = analyze_resistance(mutations, info)
+                        all_results[gene] = {
+                            'mutations': mutations,
+                            'resistance': resistance
+                        }
+                        if debug_mode:
+                            st.write(f"Found {len(mutations)} mutations")
+                            st.write(f"Identified {len(resistance)} resistance patterns")
+                    else:
+                        st.error(f"Failed to analyze {gene}")
+                # Clear progress indicators
+                progress_bar.empty()
+                status_text.empty()
                 # Display Results
+                st.header("Analysis Results")
+                # Show results for each gene
+                for gene, results in all_results.items():
+                    st.subheader(f"{gene} Analysis")
+                    info = RESISTANCE_GENES[gene]
+                    st.write(f"Drug: {info['drug']}")
+                    st.write(f"Total mutations found: {len(results['mutations'])}")
+                    if results['mutations']:
+                        mutations_df = pd.DataFrame(results['mutations'])
+                        st.write("All mutations found:")
+                        st.dataframe(mutations_df)
+                    if results['resistance']:
+                        st.warning(f"Potential resistance mutations found in {gene}")
+                        resistance_df = pd.DataFrame(results['resistance'])
+                        st.dataframe(resistance_df)
+                    else:
+                        st.info(f"No known resistance mutations found in {gene}")
+                # Download complete results
+                if st.button("Download Complete Analysis"):
+                    # Create detailed report DataFrame
+                    report_data = []
+                    for gene, results in all_results.items():
+                        for mut in results['mutations']:
+                            report_data.append({
+                                'Gene': gene,
+                                'Drug': RESISTANCE_GENES[gene]['drug'],
+                                **mut
+                            })
+                    report_df = pd.DataFrame(report_data)
                     csv = report_df.to_csv(index=False)
                     st.download_button(
+                        "Download Full Report (CSV)",
                         csv,
+                        "mtb_analysis_report.csv",
+                        "text/csv"
                     )
 if __name__ == "__main__":
     main()