Spaces:
Sleeping
Sleeping
import streamlit as st | |
import zipfile | |
import urllib.request | |
import glob | |
import SigProfilerMatrixGenerator | |
from SigProfilerMatrixGenerator import install as genInstall | |
import shutil | |
import os | |
import re | |
from SigProfilerExtractor import sigpro as sig | |
from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as datadump | |
import sys | |
import numpy as np | |
import pandas as pd | |
import base64 | |
import streamlit.components.v1 as components | |
from liftover import get_lifter | |
# Coordinate lifter created once at import time; the arguments request a
# conversion from hg38 to hg19.
converter = get_lifter("hg38", "hg19")

# Working directory at the moment the script starts.
curdir = os.getcwd()
def remove_old_vcf():
    """Delete every ``*.vcf`` file directly inside ``input/`` and ``input/input/``.

    Both directories are resolved relative to the current working
    directory.  Takes no arguments and returns ``None``.
    """
    for vcf_pattern in ('input/*.vcf', 'input/input/*.vcf'):
        for stale_vcf in glob.glob(vcf_pattern):
            os.remove(stale_vcf)
def show_pdf(file_path):
    """Render the PDF stored at *file_path* inline in the Streamlit page.

    The file is read in binary mode, base64-encoded and embedded as a
    ``data:application/pdf`` URI inside a 1500x1000 iframe.
    """
    with open(file_path, "rb") as pdf_handle:
        raw_pdf = pdf_handle.read()
    encoded_pdf = base64.b64encode(raw_pdf).decode('utf-8')
    st.markdown(
        f'<iframe src="data:application/pdf;base64,{encoded_pdf}" width="1500" height="1000" type="application/pdf"></iframe>',
        unsafe_allow_html=True,
    )
def showdl_counts(file_to_lookat, to_dl_sbs96, to_dl_sbs1536):
    """Render download links for the SBS-96 and SBS-1536 count tables.

    Args:
        file_to_lookat: uploaded-file objects; only their ``.name`` is read.
        to_dl_sbs96: per-file SBS-96 table contents (bytes), in the same
            order as ``file_to_lookat``.  An entry equal to ``[]`` marks a
            file without results and is skipped.
        to_dl_sbs1536: per-file SBS-1536 table contents (bytes), same order.

    Returns:
        None.  Two ``<a>`` elements per file are written with ``st.markdown``.
    """
    # Function-scope import keeps this block self-contained.
    import html

    def build_link(payload, download_name, label):
        # The attribute values are assembled without line continuations so no
        # stray whitespace ends up inside ``href`` or ``download``.
        encoded = base64.b64encode(payload).decode()
        return (
            f'<a href="data:application/octet-stream;base64,{encoded}" '
            f'download="{html.escape(download_name, quote=True)}">'
            f'{html.escape(label)}</a>'
        )

    for uploaded, sbs96, sbs1536 in zip(file_to_lookat, to_dl_sbs96, to_dl_sbs1536):
        if sbs96 == []:
            continue
        # The name comes from the user's upload, so it is escaped before being
        # placed in markup that is rendered with unsafe_allow_html=True.
        name = uploaded.name
        links = (
            build_link(
                sbs96,
                f'{name}96SBS.txt',
                f'Download {name} Single Base Substitution (96) table',
            ),
            build_link(
                sbs1536,
                f'{name}1536SBS.txt',
                f'Download {name} Single Base Substitution (1536) table',
            ),
        )
        for link in links:
            st.markdown(link, unsafe_allow_html=True)
#@st.cache_data(experimental_allow_widgets=True) | |
def showdl(file_to_lookat,to_dl_sbs,to_dl_indel,to_dl_dbs,to_dl_sbs_text,to_dl_indel_text,to_dl_dbs_text,to_dl_sbs_summary_text,to_dl_id_summary_text,to_dl_dbs_summary_text):
    """Render download links for the SBS, indel and DBS results of each file.

    For every context the plot PDF, the sample table and the summary table
    are offered as base64 ``data:`` links.  All SBS links are written first,
    then all indel links, then all DBS links.

    Args:
        file_to_lookat: uploaded-file objects; only their ``.name`` is read.
        to_dl_sbs, to_dl_indel, to_dl_dbs: per-file plot PDFs (bytes).  An
            entry equal to ``[]`` means "no result" and hides that file's
            three links for the context.
        to_dl_sbs_text, to_dl_indel_text, to_dl_dbs_text: per-file sample
            tables (bytes).
        to_dl_sbs_summary_text, to_dl_id_summary_text,
        to_dl_dbs_summary_text: per-file summary tables (bytes).

    Returns:
        None.  Links are written with ``st.markdown``.
    """
    # Function-scope import keeps this block self-contained.
    import html

    def emit_link(payload, download_name, label):
        # Built without line continuations so no stray whitespace leaks into
        # the ``href`` / ``download`` attribute values.
        encoded = base64.b64encode(payload).decode()
        st.markdown(
            f'<a href="data:application/octet-stream;base64,{encoded}" '
            f'download="{html.escape(download_name, quote=True)}">'
            f'{html.escape(label)}</a>',
            unsafe_allow_html=True,
        )

    # (file-name stem, label text, summary label text, PDFs, tables, summaries)
    sections = (
        ('SBS', 'Single Base Substitution', 'Summary Single Base Substitution',
         to_dl_sbs, to_dl_sbs_text, to_dl_sbs_summary_text),
        ('Indel', 'indel', 'summary indel',
         to_dl_indel, to_dl_indel_text, to_dl_id_summary_text),
        ('DBS', 'Double Base Substitution', 'summary Double Base Substitution',
         to_dl_dbs, to_dl_dbs_text, to_dl_dbs_summary_text),
    )
    for stem, title, summary_title, pdfs, tables, summaries in sections:
        for uploaded, pdf, table, summary in zip(file_to_lookat, pdfs, tables, summaries):
            if pdf == []:
                continue
            # User-supplied upload name: escaped inside emit_link.
            name = uploaded.name
            emit_link(pdf, f'{name}{stem}.pdf', f'Download {name} {title} pdf')
            emit_link(table, f'{name}{stem}.txt', f'Download {name} {title} table')
            # The summary gets its own file name so it no longer collides
            # with the sample table's "<stem>.txt" download.
            emit_link(
                summary,
                f'{name}{stem}_summary.txt',
                f'Download {name} {summary_title} table',
            )
#st.download_button(label="Download image with single base substitution profiles", key=j, | |
# data=to_dl_sbs[j], | |
# file_name="SBS.pdf", | |
# mime='application/octet-stream') | |
#st.download_button(label="Download image with indel profiles", key=0.5+j, | |
# data=to_dl_indel[j], | |
# file_name="idel.pdf", | |
# mime='application/octet-stream') | |
def dl_counts(valforkey):
    """Read the two count tables from the current working directory.

    Args:
        valforkey: not used; accepted so existing callers keep working.

    Returns:
        A ``(sbs96_bytes, sbs1536_bytes)`` tuple holding the raw contents of
        ``sbs96.txt`` and ``sbs1536.txt``.

    Raises:
        OSError: if either file cannot be opened (for example when it does
            not exist).
    """
    # The ``with`` blocks close the files; no explicit close() is needed.
    with open("sbs96.txt", "rb") as sbs96_file:
        sbs96_all_bytes = sbs96_file.read()
    with open("sbs1536.txt", "rb") as sbs1536_file:
        sbs1536_all_bytes = sbs1536_file.read()
    return sbs96_all_bytes, sbs1536_all_bytes
def _dl_read_bytes(path):
    """Return the complete contents of *path* as bytes."""
    with open(path, "rb") as handle:
        return handle.read()


def _dl_summary_bytes(samples_path, summary_path):
    """Write per-``mutation_simple`` totals of *samples_path* to *summary_path* and return that file's bytes."""
    samples = pd.read_table(samples_path)
    # Only the column at position 1 is totalled.
    # NOTE(review): presumably the single sample's counts - confirm that one
    # sample per run is guaranteed.
    samples['nums'] = samples.iloc[:, 1]
    # Drop a leading "<char>[" and a trailing "]<char>" from the label; labels
    # without brackets pass through unchanged.
    samples['mutation_simple'] = samples['MutationType'].apply(
        lambda label: re.sub(r'].$', '', re.sub(r'^.\[', '', label))
    )
    # Sum only the count column instead of applying sum() to every column.
    totals = samples.groupby('mutation_simple')[['nums']].sum()
    totals.to_csv(summary_path, sep='\t', header=False, index=True)
    return _dl_read_bytes(summary_path)


def _dl_optional_context(context, summary_path):
    """Return ``(pdf, table, summary)`` bytes for *context*, or three ``[]`` when its plot PDF is absent."""
    pdf_path = (
        f'output/{context}/Suggested_Solution/'
        f'COSMIC_{context}_Decomposed_Solution/{context}_Decomposition_Plots.pdf'
    )
    if not glob.glob(pdf_path):
        return [], [], []
    samples_path = f'output/{context}/Samples.txt'
    return (
        _dl_read_bytes(pdf_path),
        _dl_read_bytes(samples_path),
        _dl_summary_bytes(samples_path, summary_path),
    )


def dl(valforkey):
    """Collect the extractor results from ``output/`` and then delete the run directories.

    The SBS96 plot PDF and ``Samples.txt`` are required.  The ID83 and DBS78
    results are optional and are replaced by ``[]`` placeholders when their
    decomposition-plot PDF does not exist.  Summary tables are written to
    ``sbs_summary.txt``, ``id_summary.txt`` and ``dbs_summary.txt`` in the
    working directory before being read back.

    Args:
        valforkey: not used; accepted so existing callers keep working.

    Returns:
        A 9-tuple ``(sbs_pdf, id_pdf, dbs_pdf, sbs_table, id_table,
        dbs_table, sbs_summary, id_summary, dbs_summary)``; each element is
        ``bytes`` or ``[]``.

    Raises:
        OSError: if a required SBS96 file cannot be opened.
    """
    sbs_dir = 'output/SBS96/Suggested_Solution/COSMIC_SBS96_Decomposed_Solution/'
    for found_pdf in glob.glob(sbs_dir + '*pdf'):
        st.write('pdf file with sbs96 output is here: ' + found_pdf)

    PDFbyte1 = _dl_read_bytes(sbs_dir + 'SBS96_Decomposition_Plots.pdf')
    Txtbyte1 = _dl_read_bytes('output/SBS96/Samples.txt')
    summary_table_sbs_all_bytes = _dl_summary_bytes(
        'output/SBS96/Samples.txt', 'sbs_summary.txt'
    )

    PDFbyte2, Txtbyte2, summary_table_id_all_bytes = _dl_optional_context(
        'ID83', 'id_summary.txt'
    )
    PDFbyte3, Txtbyte3, summary_table_dbs_all_bytes = _dl_optional_context(
        'DBS78', 'dbs_summary.txt'
    )

    # Best-effort cleanup so the next file starts from empty directories;
    # replaces the shell "rm -r" calls and, like them, does not raise.
    shutil.rmtree('output', ignore_errors=True)
    shutil.rmtree('input', ignore_errors=True)

    return PDFbyte1,PDFbyte2,PDFbyte3,Txtbyte1,Txtbyte2,Txtbyte3, summary_table_sbs_all_bytes, summary_table_id_all_bytes,summary_table_dbs_all_bytes
#st.write(glob.glob(os.path.join(os.path.dirname(SigProfilerMatrixGenerator.__file__),'references/*txt'))) | |
# Upload form: reports whether reference sequences are already installed and
# collects the analysis options plus the uploaded files.
with st.form('get signature'):
    # Glob pattern for the GRCh37/GRCh38 reference directory inside the
    # installed SigProfilerMatrixGenerator package.
    _reference_dir_glob = os.path.join(
        os.path.dirname(SigProfilerMatrixGenerator.__file__),
        'references/chromosomes/tsb/GRCh3[78]/',
    )
    # True when no "*txt" reference file is found there.
    refdownload = not glob.glob(_reference_dir_glob + '*txt')
    if refdownload:
        st.write('There is no reference genome, we need to download this')
    else:
        st.write('using reference from here:' + glob.glob(_reference_dir_glob)[0])

    no_profiles_only_counts = st.radio(
        'Do Not Perform COSMIC profile analysis', [False, True]
    )
    referencegenome = st.radio('reference', ['hg19', 'GRCh38'])
    file_to_lookat = st.file_uploader(
        'VCF upload here', type=[".vcf", "xlsx"], accept_multiple_files=True
    )
    # Clear VCFs left in input/ by an earlier run.
    remove_old_vcf()
    sub = st.form_submit_button('submit input')
# Runs once the form has been submitted with at least one uploaded file:
# fetch the reference if needed, process each upload on its own, then show
# the download links.
if file_to_lookat and sub:
    # Directory of the installed SigProfilerMatrixGenerator package; the
    # reference archive is unpacked into it.
    dirtest = os.path.dirname(SigProfilerMatrixGenerator.__file__)
    if refdownload:
        if referencegenome == 'GRCh38':
            st.write('using liftover with hg19 instead of downloading Grch38')
        with st.spinner('downloading hg19 reference'):
            # NOTE(review): the archive is fetched from a fixed URL and
            # extracted without any integrity check - consider pinning a hash.
            urllib.request.urlretrieve(
                'https://dl.dropboxusercontent.com/s/et97ewsct862x7m/references.zip?dl=0',
                'references.zip',
            )
            with zipfile.ZipFile('references.zip', 'r') as zip_ref:
                zip_ref.extractall(dirtest)

    # One entry per uploaded file, index-aligned with file_to_lookat.
    to_dl_sbs = []
    to_dl_indel = []
    to_dl_dbs = []
    to_dl_sbs_text = []
    to_dl_indel_text = []
    to_dl_dbs_text = []
    to_dl_sbs_summary_text = []
    to_dl_id_summary_text = []
    to_dl_dbs_summary_text = []
    to_dl_sbs96 = []
    to_dl_sbs1536 = []
    # Same order as the tuple returned by dl().
    signature_buckets = (
        to_dl_sbs, to_dl_indel, to_dl_dbs,
        to_dl_sbs_text, to_dl_indel_text, to_dl_dbs_text,
        to_dl_sbs_summary_text, to_dl_id_summary_text, to_dl_dbs_summary_text,
    )

    for j, uploaded in enumerate(file_to_lookat):
        # dl() deletes input/ after every file, so recreate it each time.
        os.makedirs('input/input', exist_ok=True)
        remove_old_vcf()

        # The name is supplied by the client; keep only its final path
        # component before using it on the filesystem.
        upload_name = os.path.basename(uploaded.name)
        if upload_name.endswith('vcf'):
            # NOTE(review): VCF uploads are passed on unchanged; in this code
            # the `referencegenome` choice only affects the message above -
            # confirm GRCh38 VCFs are not expected here.
            with open(os.path.join('input', upload_name), 'wb') as f:
                f.write(uploaded.read())
        else:
            # Spreadsheet input: lift the coordinates over and write a
            # header-less, tab-separated VCF-style file.
            table_of_penn_file = pd.read_excel(uploaded)
            # The first data row is skipped, as before; copy so the column
            # assignments below act on an independent frame.
            table_of_penn_file = table_of_penn_file.iloc[1:, :].copy()
            # Label-based access: positional x[0] / x[1] on a labelled Series
            # is deprecated in pandas 2.x.
            # NOTE(review): a position with no liftover hit raises IndexError
            # on the trailing [0] - confirm inputs are always mappable.
            tt = table_of_penn_file[['Chrom', 'Pos']].apply(
                lambda row: converter[row['Chrom']][int(row['Pos'])][0], axis=1
            )
            table_of_penn_file['Chrom'] = [hit[0] for hit in tt]
            table_of_penn_file['Pos'] = [hit[1] for hit in tt]
            tovcf = pd.DataFrame()
            tovcf['Chrom'] = table_of_penn_file['Chrom']
            tovcf['Pos'] = table_of_penn_file['Pos']
            tovcf['db'] = '.'
            tovcf['ref'] = table_of_penn_file['Ref']
            tovcf['alt'] = table_of_penn_file['Alt']
            nameuse = re.sub('xlsx$', 'vcf', upload_name)
            # Written straight into input/ rather than to the working
            # directory and then copied.
            tovcf.to_csv(os.path.join('input', nameuse), sep='\t', header=False, index=False)
            st.write('file after liftover:')
            st.write(tovcf)

        if no_profiles_only_counts:
            refgen = "GRCh37"
            project = "input"
            project_name = project.split("/")[-1]
            with st.spinner('computing counts only'):
                data = datadump.SigProfilerMatrixGeneratorFunc(project_name, refgen, project, exome=False, bed_file=None, chrom_based=False, plot=False, gs=False)
            data['96'].to_csv('sbs96.txt', sep='\t', header=False, index=True)
            data['1536'].to_csv('sbs1536.txt', sep='\t', header=False, index=True)
            sbs96_result, sbs1536_result = dl_counts(j)
            to_dl_sbs96.append(sbs96_result)
            to_dl_sbs1536.append(sbs1536_result)
        else:
            with st.spinner('computing signatures'):
                sig.sigProfilerExtractor("vcf", "output", "input", minimum_signatures=1, maximum_signatures=3,nmf_test_conv= 1000,nmf_tolerance= 1e-10,max_nmf_iterations=100000,min_nmf_iterations= 1000)
            if glob.glob('output/SBS96/Suggested_Solution/COSMIC_SBS96_Decomposed_Solution/*pdf'):
                signature_results = dl(j)
            else:
                # Keep every list aligned with file_to_lookat: showdl() skips
                # [] entries, so later files keep their own names.
                st.write('no SBS96 decomposition output was produced for ' + uploaded.name)
                signature_results = tuple([] for _ in signature_buckets)
            for bucket, result in zip(signature_buckets, signature_results):
                bucket.append(result)
        remove_old_vcf()

    if no_profiles_only_counts:
        showdl_counts(file_to_lookat,to_dl_sbs96,to_dl_sbs1536)
    else:
        showdl(file_to_lookat,to_dl_sbs,to_dl_indel,to_dl_dbs,to_dl_sbs_text,to_dl_indel_text,to_dl_dbs_text,to_dl_sbs_summary_text,to_dl_id_summary_text,to_dl_dbs_summary_text)
# Embeds the COSMIC SBS signatures web page in an 800x3000 iframe.
# NOTE(review): this copy of the file has lost its indentation, so whether
# this call sits at top level or inside the submit branch cannot be told from
# here - confirm against the deployed source before relying on when it renders.
components.iframe("https://cancer.sanger.ac.uk/signatures/sbs/", height=3000,width=800) | |
#show_pdf('output/ID83/Suggested_Solution/COSMIC_ID83_Decomposed_Solution/ID83_Decomposition_Plots.pdf') | |
#components.iframe("https://cancer.sanger.ac.uk/signatures/id/",height=1000,width=800) |