Spaces:

lyimo
/

dnaseq

Sleeping

App Files Files Community

lyimo committed 27 days ago

Commit

a8a77be

verified ·

1 Parent(s): 129decb

Update app.py

Browse files

Files changed (1) hide show

app.py +246 -174

app.py CHANGED Viewed

@@ -1,213 +1,285 @@
-# app.py
 import streamlit as st
 from Bio import pairwise2
 import re
 from collections import defaultdict
 import pandas as pd
 import plotly.express as px
-import io
def read_fasta_from_upload(uploaded_file):
    """
    Read the sequence from a FASTA file uploaded through Streamlit.

    The first line is always treated as the FASTA header and discarded.
    Any later header line (one starting with '>') is skipped, so the
    residues of every record are concatenated in file order. All
    whitespace, including the carriage returns of CRLF files, is removed.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the raw bytes of
            the upload (decoded here as UTF-8).

    Returns:
        The upper-cased sequence string, or None when the upload cannot be
        decoded or contains no sequence data (the problem is reported with
        ``st.error``).
    """
    try:
        content = uploaded_file.getvalue().decode('utf-8').strip()
        # splitlines() handles \n, \r\n and \r endings, so no stray '\r'
        # characters end up inside the sequence.
        body_lines = content.splitlines()[1:]
        sequence = ''.join(
            ''.join(line.split())
            for line in body_lines
            if not line.lstrip().startswith('>')
        )
        if not sequence:
            raise ValueError("no sequence data found after the FASTA header")
        return sequence.upper()
    except Exception as e:
        # Broad on purpose: this is the UI boundary and callers rely on a
        # None return for any unreadable upload.
        st.error(f"Error reading uploaded file: {str(e)}")
        return None
 def extract_gene_region(genome_seq, gene_start, gene_end):
-    """
-    Extract a gene region with additional context
-    """
-    try:
-        flank = 200
-        start = max(0, gene_start - flank)
-        end = min(len(genome_seq), gene_end + flank)
-        return genome_seq[start:end], start
-    except Exception as e:
-        st.error(f"Error extracting gene region: {str(e)}")
-        return None, None
 def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
-    """
-    Find mutations with sequence context
-    """
-    try:
-        alignments = pairwise2.align.globalms(ref_seq, query_seq,
-                                            match=2,
-                                            mismatch=-3,
-                                            open=-10,
-                                            extend=-0.5)
-        if not alignments:
-            st.warning("No alignments found")
-            return []
-        alignment = alignments[0]
-        ref_aligned, query_aligned = alignment[0], alignment[1]
-        mutations = []
-        real_pos = 0
-        for i in range(len(ref_aligned)):
-            if ref_aligned[i] != '-':
-                real_pos += 1
-            if ref_aligned[i] != query_aligned[i]:
-                adj_pos = offset + real_pos
-                if gene_start <= adj_pos <= gene_end:
-                    mut = {
-                        'position': adj_pos,
-                        'gene_position': adj_pos - gene_start + 1,
-                        'ref_base': ref_aligned[i],
-                        'query_base': query_aligned[i] if query_aligned[i] != '-' else 'None',
-                        'type': 'SNP' if ref_aligned[i] != '-' and query_aligned[i] != '-' else 'INDEL',
-                        'codon_position': (real_pos - 1) % 3 + 1,
-                        'context': {
-                            'ref': ref_aligned[max(0,i-5):i] + '[' + ref_aligned[i] + ']' + ref_aligned[i+1:i+6],
-                            'query': query_aligned[max(0,i-5):i] + '[' + query_aligned[i] + ']' + query_aligned[i+1:i+6]
-                        }
-                    }
-                    mutations.append(mut)
-        return mutations
-    except Exception as e:
-        st.error(f"Error in mutation analysis: {str(e)}")
         return []
# Dictionary of important M. tuberculosis genes and their positions.
# Each entry holds:
#   'start' / 'end'  - coordinates main() passes to extract_gene_region() and
#                      find_mutations_with_context() to slice and filter the gene
#   'description'    - label shown next to the gene name in the gene selector
# NOTE(review): coordinates presumably refer to the H37Rv reference genome
# mentioned in the page text - confirm against the annotation in use.
IMPORTANT_GENES = {
    'rpoB': {'start': 759807, 'end': 763325, 'description': 'RNA polymerase β subunit (Rifampicin resistance)'},
    'katG': {'start': 2153889, 'end': 2156111, 'description': 'Catalase-peroxidase (Isoniazid resistance)'},
    'inhA': {'start': 1674202, 'end': 1675011, 'description': 'Enoyl-ACP reductase (Isoniazid resistance)'},
    'gyrA': {'start': 7302, 'end': 9818, 'description': 'DNA gyrase subunit A (Fluoroquinolone resistance)'}
}
def create_mutation_dataframe(mutations):
    """
    Build a display table from a list of mutation dicts.

    Args:
        mutations: List of dicts carrying the keys ``position``,
            ``gene_position``, ``type``, ``ref_base``, ``query_base`` and
            ``codon_position``.

    Returns:
        A DataFrame with one row per mutation and human-readable column
        names, or an empty DataFrame when ``mutations`` is empty.
    """
    if not mutations:
        return pd.DataFrame()

    # (column title, source key) pairs in the order the columns appear
    column_sources = (
        ('Position', 'position'),
        ('Gene Position', 'gene_position'),
        ('Type', 'type'),
        ('Reference', 'ref_base'),
        ('Query', 'query_base'),
        ('Codon Position', 'codon_position'),
    )
    rows = [
        {title: entry[key] for title, key in column_sources}
        for entry in mutations
    ]
    return pd.DataFrame(rows)
def plot_mutation_distribution(df):
    """
    Plot mutation type against genome position as a scatter chart.

    Args:
        df: DataFrame with 'Position' and 'Type' columns.

    Returns:
        The Plotly Express scatter figure, or None when ``df`` is empty.
    """
    if df.empty:
        return None

    axis_labels = {'Position': 'Genome Position', 'Type': 'Mutation Type'}
    scatter_options = dict(
        x='Position',
        y='Type',
        color='Type',
        title='Mutation Distribution',
        labels=axis_labels,
    )
    return px.scatter(df, **scatter_options)
 def main():
-    st.title("M. tuberculosis Genome Comparison Tool")
     st.markdown("""
-    This tool compares two M. tuberculosis genomes and identifies mutations in important genes.
-    Upload your reference genome (typically H37Rv) and your query genome (wild type/clinical isolate) in FASTA format.
     """)
-    # File upload section
-    col1, col2 = st.columns(2)
-    with col1:
-        reference_file = st.file_uploader("Upload Reference Genome (FASTA)", type=['fasta', 'fa'])
-    with col2:
-        query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
-    # Gene selection
-    selected_gene = st.selectbox(
-        "Select gene to analyze",
-        options=list(IMPORTANT_GENES.keys()),
-        format_func=lambda x: f"{x} - {IMPORTANT_GENES[x]['description']}"
-    )
-    if reference_file and query_file:
-        if st.button("Analyze Genomes"):
-            with st.spinner("Analyzing genomes..."):
-                # Read sequences
-                ref_genome = read_fasta_from_upload(reference_file)
                 query_genome = read_fasta_from_upload(query_file)
-                if ref_genome and query_genome:
-                    # Get gene coordinates
-                    gene_start = IMPORTANT_GENES[selected_gene]['start']
-                    gene_end = IMPORTANT_GENES[selected_gene]['end']
-                    # Extract and analyze gene regions
-                    ref_region, ref_start = extract_gene_region(ref_genome, gene_start, gene_end)
-                    query_region, _ = extract_gene_region(query_genome, gene_start, gene_end)
-                    if ref_region and query_region:
-                        # Find mutations
-                        mutations = find_mutations_with_context(
-                            ref_region, query_region,
-                            gene_start, gene_end,
-                            ref_start
-                        )
-                        # Create results section
-                        st.subheader("Analysis Results")
-                        # Summary statistics
-                        st.markdown("### Summary Statistics")
-                        total_mutations = len(mutations)
-                        snps = len([m for m in mutations if m['type'] == 'SNP'])
-                        indels = len([m for m in mutations if m['type'] == 'INDEL'])
-                        col1, col2, col3 = st.columns(3)
-                        col1.metric("Total Mutations", total_mutations)
-                        col2.metric("SNPs", snps)
-                        col3.metric("INDELs", indels)
-                        # Convert mutations to DataFrame
-                        df = create_mutation_dataframe(mutations)
-                        if not df.empty:
-                            # Plot mutation distribution
-                            st.plotly_chart(plot_mutation_distribution(df))
-                            # Detailed mutation table
-                            st.markdown("### Detailed Mutation Analysis")
-                            st.dataframe(df)
-                            # Download results
-                            csv = df.to_csv(index=False)
-                            st.download_button(
-                                "Download Results CSV",
-                                csv,
-                                "mutations.csv",
-                                "text/csv",
-                                key='download-csv'
-                            )
-                        else:
-                            st.info(f"No mutations found in {selected_gene}")
-                    else:
-                        st.error("Error extracting gene regions")
                 else:
-                    st.error("Error reading genome files")
 if __name__ == "__main__":
     main()

 import streamlit as st
 from Bio import pairwise2
 import re
 from collections import defaultdict
 import pandas as pd
 import plotly.express as px
+import plotly.graph_objects as go
# Define important gene regions and their associated resistance patterns.
#
# Each gene entry holds:
#   'start' / 'end'  - coordinates main() passes to extract_gene_region() and
#                      find_mutations_with_context() to slice and filter the gene
#   'description'    - human-readable gene product
#   'drug'           - drug label used in the report headings and plot legend
#   'mutations'      - catalogue keyed by position (as a string); each value gives
#                      the expected original symbol ('from'), the accepted
#                      replacements ('to') and 'freq' / 'confidence' labels that
#                      analyze_resistance() copies into its findings
#
# NOTE(review): coordinates presumably refer to the H37Rv reference
# (NC_000962.3) loaded in main() - confirm against the annotation in use.
# NOTE(review): most 'from'/'to' values look like amino-acid one-letter codes,
# while analyze_resistance() compares them with single aligned sequence
# characters - confirm whether a translation step is intended.
# NOTE(review): analyze_resistance() derives its lookup key from gene positions
# that are always >= 1, so the '-15' key appears unreachable - verify.
RESISTANCE_GENES = {
    'rpoB': {
        'start': 759807,
        'end': 763325,
        'description': 'RNA polymerase β subunit',
        'drug': 'Rifampicin',
        # NOTE(review): confirm which codon numbering scheme these keys use
        'mutations': {
            '531': {'from': 'S', 'to': ['L'], 'freq': 'High', 'confidence': 'High'},
            '526': {'from': 'H', 'to': ['Y', 'D', 'R'], 'freq': 'High', 'confidence': 'High'},
            '516': {'from': 'D', 'to': ['V', 'G'], 'freq': 'Moderate', 'confidence': 'High'},
            '511': {'from': 'L', 'to': ['P'], 'freq': 'Low', 'confidence': 'Moderate'}
        }
    },
    'katG': {
        'start': 2153889,
        'end': 2156111,
        'description': 'Catalase-peroxidase',
        'drug': 'Isoniazid',
        'mutations': {
            '315': {'from': 'S', 'to': ['T', 'N'], 'freq': 'High', 'confidence': 'High'},
            '463': {'from': 'R', 'to': ['L'], 'freq': 'Moderate', 'confidence': 'Moderate'}
        }
    },
    'inhA': {
        'start': 1674202,
        'end': 1675011,
        'description': 'Enoyl-ACP reductase',
        'drug': 'Isoniazid/Ethionamide',
        'mutations': {
            # Negative key: presumably a position upstream of the gene start - confirm
            '-15': {'from': 'C', 'to': ['T'], 'freq': 'High', 'confidence': 'High'},
            '94': {'from': 'S', 'to': ['A'], 'freq': 'Moderate', 'confidence': 'High'}
        }
    },
    'gyrA': {
        'start': 7302,
        'end': 9818,
        'description': 'DNA gyrase subunit A',
        'drug': 'Fluoroquinolones',
        'mutations': {
            '90': {'from': 'A', 'to': ['V'], 'freq': 'High', 'confidence': 'High'},
            '94': {'from': 'D', 'to': ['G', 'A', 'N'], 'freq': 'High', 'confidence': 'High'}
        }
    }
}
def read_fasta_file(file_path):
    """Read the sequence from a FASTA file on disk.

    The first line is always treated as the FASTA header and discarded.
    Any later header line (one starting with '>') is skipped, so the
    residues of every record are concatenated in file order. All
    whitespace is removed.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        The upper-cased sequence string.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the file contains no sequence data.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.read().strip()
    body_lines = content.splitlines()[1:]
    sequence = ''.join(
        ''.join(line.split())
        for line in body_lines
        if not line.lstrip().startswith('>')
    )
    if not sequence:
        # Previously a header-only file failed with a bare IndexError.
        raise ValueError(f"no sequence data found in FASTA file: {file_path}")
    return sequence.upper()
def read_fasta_from_upload(uploaded_file):
    """Read the sequence from a FASTA file uploaded through Streamlit.

    The first line is always treated as the FASTA header and discarded.
    Any later header line (one starting with '>') is skipped, so the
    residues of every record are concatenated in file order. All
    whitespace, including the carriage returns of CRLF files, is removed.

    Args:
        uploaded_file: Object whose ``getvalue()`` returns the raw bytes of
            the upload (decoded here as UTF-8).

    Returns:
        The upper-cased sequence string.

    Raises:
        UnicodeDecodeError: If the upload is not valid UTF-8.
        ValueError: If the upload contains no sequence data.
    """
    content = uploaded_file.getvalue().decode('utf-8').strip()
    # splitlines() handles \n, \r\n and \r endings, so no stray '\r'
    # characters end up inside the sequence.
    body_lines = content.splitlines()[1:]
    sequence = ''.join(
        ''.join(line.split())
        for line in body_lines
        if not line.lstrip().startswith('>')
    )
    if not sequence:
        # Previously a header-only upload failed with a bare IndexError.
        raise ValueError("no sequence data found after the FASTA header")
    return sequence.upper()
 def extract_gene_region(genome_seq, gene_start, gene_end):
+    """Extract a gene region with additional context"""
+    flank = 200
+    start = max(0, gene_start - flank)
+    end = min(len(genome_seq), gene_end + flank)
+    return genome_seq[start:end], start
 def find_mutations_with_context(ref_seq, query_seq, gene_start, gene_end, offset=0):
+    """Find mutations with sequence context"""
+    alignments = pairwise2.align.globalms(ref_seq, query_seq,
+                                        match=2,
+                                        mismatch=-3,
+                                        open=-10,
+                                        extend=-0.5)
+    if not alignments:
         return []
+    alignment = alignments[0]
+    ref_aligned, query_aligned = alignment[0], alignment[1]
+    mutations = []
+    real_pos = 0
+    for i in range(len(ref_aligned)):
+        if ref_aligned[i] != '-':
+            real_pos += 1
+        if ref_aligned[i] != query_aligned[i]:
+            adj_pos = offset + real_pos
+            if gene_start <= adj_pos <= gene_end:
+                mut = {
+                    'position': adj_pos,
+                    'gene_position': adj_pos - gene_start + 1,
+                    'ref_base': ref_aligned[i],
+                    'query_base': query_aligned[i] if query_aligned[i] != '-' else 'None',
+                    'type': 'SNP' if ref_aligned[i] != '-' and query_aligned[i] != '-' else 'INDEL',
+                    'codon_position': (real_pos - 1) % 3 + 1,
+                    'context': {
+                        'ref': ref_aligned[max(0,i-5):i] + '[' + ref_aligned[i] + ']' + ref_aligned[i+1:i+6],
+                        'query': query_aligned[max(0,i-5):i] + '[' + query_aligned[i] + ']' + query_aligned[i+1:i+6]
+                    }
+                }
+                mutations.append(mut)
+    return mutations
def analyze_resistance(mutations, gene_info):
    """Match observed mutations against a gene's catalogue of known changes.

    Args:
        mutations: List of dicts with at least ``gene_position`` (1-based
            position within the gene), ``ref_base`` and ``query_base``.
        gene_info: Gene entry whose ``'mutations'`` dict is keyed by codon
            number (as a string) and gives ``from``, ``to``, ``freq`` and
            ``confidence`` for each known change.

    Returns:
        A list of dicts (``position``, ``change``, ``frequency``,
        ``confidence``), one per mutation that matches a catalogue entry.
    """
    # NOTE(review): 'from'/'to' are compared with single aligned sequence
    # characters, while the catalogue in RESISTANCE_GENES appears to hold
    # amino-acid substitutions - confirm whether translation is required.
    catalogue = gene_info['mutations']
    resistance_found = []
    for mut in mutations:
        # gene_position is 1-based, so positions 1-3 form codon 1, 4-6 codon 2, ...
        codon_pos = str((mut['gene_position'] - 1) // 3 + 1)
        pattern = catalogue.get(codon_pos)
        if pattern is None:
            continue
        if mut['ref_base'] == pattern['from'] and mut['query_base'] in pattern['to']:
            resistance_found.append({
                'position': codon_pos,
                'change': f"{pattern['from']}{codon_pos}{mut['query_base']}",
                'frequency': pattern['freq'],
                'confidence': pattern['confidence']
            })
    return resistance_found
def create_resistance_report(all_results):
    """Create a comprehensive resistance report.

    Args:
        all_results: Mapping of gene name to a dict whose ``'resistance'``
            value is the list of findings for that gene (each finding
            carries a ``'confidence'`` label).

    Returns:
        A list with one dict per gene that has findings, holding ``gene``,
        ``drug`` (looked up in ``RESISTANCE_GENES``), ``mutations_found``,
        ``mutations`` and the strongest ``confidence`` label among them.
    """
    # Labels must be ranked explicitly: compared as plain strings,
    # 'Moderate' sorts above both 'Low' and 'High'.
    confidence_rank = {'Low': 0, 'Moderate': 1, 'High': 2}
    report = []
    for gene, results in all_results.items():
        mutations = results['resistance']
        if not mutations:
            continue
        confidence = max(
            (m['confidence'] for m in mutations),
            key=lambda label: confidence_rank.get(label, -1),  # unknown labels rank lowest
        )
        report.append({
            'gene': gene,
            'drug': RESISTANCE_GENES[gene]['drug'],
            'mutations_found': len(mutations),
            'mutations': mutations,
            'confidence': confidence
        })
    return report
def plot_gene_mutations(mutations_by_gene, genome_length):
    """Draw every resistance gene as a bar with its mutations as stars above it.

    Args:
        mutations_by_gene: Mapping of gene name to an iterable of items
            indexable by ``'position'``; genes missing from the mapping
            get no markers.
        genome_length: Accepted but not used by this function.

    Returns:
        The Plotly figure with one line trace per gene in
        ``RESISTANCE_GENES`` and one marker trace per gene with mutations.
    """
    palette = {'rpoB': 'red', 'katG': 'blue', 'inhA': 'green', 'gyrA': 'purple'}
    figure = go.Figure()

    for gene_name, details in RESISTANCE_GENES.items():
        hits = mutations_by_gene.get(gene_name, [])
        shade = palette.get(gene_name, 'gray')

        # Thick horizontal line spanning the gene's coordinates
        gene_bar = go.Scatter(
            x=[details['start'], details['end']],
            y=[1, 1],
            mode='lines',
            name=f"{gene_name} ({details['drug']})",
            line=dict(color=shade, width=20, dash='solid'),
        )
        figure.add_trace(gene_bar)

        if not hits:
            continue

        # Star markers placed slightly above the gene bar
        positions = [hit['position'] for hit in hits]
        markers = go.Scatter(
            x=positions,
            y=[1.2] * len(positions),
            mode='markers',
            name=f'{gene_name} mutations',
            marker=dict(color=shade, size=10, symbol='star'),
        )
        figure.add_trace(markers)

    figure.update_layout(
        title="Resistance-associated Mutations",
        xaxis_title="Genome Position",
        yaxis_visible=False,
        showlegend=True,
        height=400,
        margin=dict(l=50, r=50, t=50, b=50)
    )
    return figure
 def main():
+    st.title("M. tuberculosis Drug Resistance Analysis")
     st.markdown("""
+    ### Automated Drug Resistance Analysis Tool
+    Upload your query genome (clinical isolate) in FASTA format for comparison with H37Rv reference.
+    The tool will automatically analyze resistance-associated genes and provide a detailed report.
     """)
+    # Load reference genome
+    try:
+        ref_genome = read_fasta_file("NC_000962.3.fasta")
+        st.success("Reference genome (H37Rv) loaded successfully")
+    except Exception as e:
+        st.error(f"Error loading reference genome: {e}")
+        return
+    # Query genome upload
+    query_file = st.file_uploader("Upload Query Genome (FASTA)", type=['fasta', 'fa'])
+    if query_file:
+        if st.button("Analyze Drug Resistance"):
+            with st.spinner("Analyzing genome..."):
                 query_genome = read_fasta_from_upload(query_file)
+                # Analyze each resistance gene
+                all_results = {}
+                for gene, info in RESISTANCE_GENES.items():
+                    # Extract and analyze regions
+                    ref_region, ref_start = extract_gene_region(ref_genome, info['start'], info['end'])
+                    query_region, _ = extract_gene_region(query_genome, info['start'], info['end'])
+                    # Find mutations
+                    mutations = find_mutations_with_context(ref_region, query_region, info['start'], info['end'], ref_start)
+                    # Analyze resistance patterns
+                    resistance = analyze_resistance(mutations, info)
+                    all_results[gene] = {
+                        'mutations': mutations,
+                        'resistance': resistance
+                    }
+                # Generate comprehensive report
+                resistance_report = create_resistance_report(all_results)
+                # Display Results
+                st.header("Drug Resistance Analysis Results")
+                if resistance_report:
+                    st.warning("⚠️ Potential drug resistance mutations detected")
+                    # Display resistance summary
+                    for entry in resistance_report:
+                        st.subheader(f"🧬 {entry['gene']} - {RESISTANCE_GENES[entry['gene']]['drug']}")
+                        st.write(f"Confidence: {entry['confidence']}")
+                        st.write(f"Mutations found: {entry['mutations_found']}")
+                        # Create detailed mutation table
+                        mutations_df = pd.DataFrame(entry['mutations'])
+                        st.dataframe(mutations_df)
+                        st.markdown("---")
+                    # Visualize mutations
+                    st.subheader("Mutation Visualization")
+                    fig = plot_gene_mutations(all_results, len(ref_genome))
+                    st.plotly_chart(fig)
+                    # Clinical interpretation
+                    st.subheader("Clinical Interpretation")
+                    st.markdown("""
+                    - High confidence mutations strongly indicate resistance
+                    - Multiple mutations in the same gene may indicate high-level resistance
+                    - Consider phenotypic testing to confirm resistance patterns
+                    """)
+                    # Download results
+                    report_df = pd.DataFrame(resistance_report)
+                    csv = report_df.to_csv(index=False)
+                    st.download_button(
+                        "Download Detailed Report (CSV)",
+                        csv,
+                        "resistance_analysis.csv",
+                        "text/csv",
+                        key='download-csv'
+                    )
                 else:
+                    st.success("No known resistance mutations detected")
+                    st.info("Note: This does not guarantee drug susceptibility. Consider phenotypic testing.")
 if __name__ == "__main__":
     main()