Spaces:
Build error
Build error
File size: 6,428 Bytes
1c744c7 d299aec 1c744c7 39526c3 1c744c7 bbc90a3 1c744c7 d299aec 1c744c7 bbc90a3 1c744c7 d299aec 1c744c7 d299aec 1c744c7 bc3d855 1c744c7 bc3d855 1c744c7 bc3d855 1c744c7 bc3d855 302ccf8 1c744c7 bc3d855 1f6598f bc3d855 1c744c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 |
import streamlit as st
from streamlit_chat import message
import os
from utils import (
parse_docx,
parse_pdf,
parse_txt,
parse_csv,
parse_pptx,
search_docs,
embed_docs,
text_to_docs,
get_answer,
parse_any,
get_sources,
wrap_text_in_html,
)
from openai.error import OpenAIError
def clear_submit() -> None:
    """Reset the ``"submit"`` flag in Streamlit's session state to ``False``.

    Used as an ``on_change`` callback by widgets in this script, so that
    changing the widget's value clears the flag.
    """
    st.session_state["submit"] = False
def set_openai_api_key(api_key: str) -> None:
    """Store ``api_key`` in Streamlit's session state.

    Args:
        api_key: The OpenAI API key entered by the user. It is saved under
            the ``"OPENAI_API_KEY"`` session-state key.
    """
    st.session_state["OPENAI_API_KEY"] = api_key
# Page title, rendered as raw HTML so the attribution link sits in the heading.
st.markdown('<h1>File GPT 🤖<small> by <a href="https://codegpt.co">Code GPT</a></small></h1>', unsafe_allow_html=True)

# Sidebar
# `index` and `doc` start as None on every script run. `index` is only
# replaced when a file has been uploaded AND embedding succeeded, so code
# further down can test `index is None` to know whether a document is ready.
index = None
doc = None

# Ordered (suffix, parser) pairs, matched against the lower-cased file name.
# Any file without a dedicated parser is handed to `parse_any`.
_PARSERS_BY_SUFFIX = (
    (".pdf", parse_pdf),
    (".docx", parse_docx),
    (".csv", parse_csv),
    (".txt", parse_txt),
    (".pptx", parse_pptx),
)

with st.sidebar:
    # API key input; pre-filled with whatever key is already in session state.
    user_secret = st.text_input(
        "OpenAI API Key",
        type="password",
        placeholder="Paste your OpenAI API key here (sk-...)",
        help="You can get your API key from https://platform.openai.com/account/api-keys.",
        value=st.session_state.get("OPENAI_API_KEY", ""),
    )
    if user_secret:
        set_openai_api_key(user_secret)

    # Uploading a different file clears the "submit" flag via `clear_submit`.
    uploaded_file = st.file_uploader(
        "Upload a pdf, docx, or txt file",
        type=["pdf", "docx", "txt", "csv", "pptx", "js", "py", "json", "html", "css", "md"],
        help="Scanned documents are not supported yet!",
        on_change=clear_submit,
    )

    if uploaded_file is not None:
        # Compare case-insensitively so e.g. "REPORT.PDF" still reaches the
        # PDF parser instead of silently falling through to `parse_any`.
        lowered_name = uploaded_file.name.lower()
        parser = next(
            (
                parse
                for suffix, parse in _PARSERS_BY_SUFFIX
                if lowered_name.endswith(suffix)
            ),
            parse_any,  # unknown extensions are parsed generically, not rejected
        )
        doc = parser(uploaded_file)

        text = text_to_docs(doc)
        st.write(text)
        try:
            with st.spinner("Indexing document... This may take a while⏳"):
                index = embed_docs(text)
                st.session_state["api_key_configured"] = True
        except OpenAIError as e:
            # `index` stays None here; show the API error instead of crashing.
            st.error(e._message)
# Two tabs: a static introduction and the chat interface.
tab1, tab2 = st.tabs(["Intro", "Chat with the File"])


def _write_html(markup):
    """Render ``markup`` with ``st.write``, allowing raw HTML."""
    st.write(markup, unsafe_allow_html=True)


with tab1:
    # --- What the tool does ---
    st.markdown("### How does it work?")
    st.write("File GPT is a tool that allows you to ask questions about a document and get answers from the document. The tool uses the OpenAI API to embed the document and then uses the Embedding API to find the most similar documents to the question. The tool then uses LangChain to obtain the answer from the most similar documents.")
    st.write("The tool is currently in beta and is not perfect. It is recommended to use it with short documents.")
    st.write("""---""")

    # --- Usage instructions ---
    st.markdown("### How to use it?")
    st.write("To use the tool you must first add your OpenAI API Key and then upload a document. The tool currently supports the following file types: pdf, docx, txt, csv, pptx. Once the document is uploaded, the tool will index the document and embed it. This may take a while depending on the size of the document. Once the document is indexed, you can ask questions about the document. The tool will return the answer to the question and the source of the answer.")
    st.markdown('<p>Read the article to know more details: <a target="_blank" href="https://medium.com/@dan.avila7/file-gpt-conversaci%C3%B3n-por-chat-con-un-archivo-698d17570358">Medium Article (Spanish)</a></p>', unsafe_allow_html=True)

    # --- Tooling credits ---
    st.write("## File GPT was written with the following tools:")

    st.markdown("#### Code GPT")
    st.write('All code was written with the help of Code GPT. Visit https://codegpt.co to get the extension.')

    st.markdown("#### Streamlit")
    _write_html('The design was written with <a target="_blank" href="https://streamlit.io/">Streamlit</a>.')

    st.markdown("#### LangChain")
    _write_html('Question answering with source <a target="_blank" href="https://langchain.readthedocs.io/en/latest/use_cases/question_answering.html#adding-in-sources">Langchain QA</a>.')

    st.markdown("#### Embedding")
    _write_html('<a target="_blank" href="https://platform.openai.com/docs/guides/embeddings">Embedding</a> is done via the OpenAI API with "text-embedding-ada-002"')
    st.write("Please note that you must have credits in your OpenAI account to use this tool. Each file uploaded to the platform consumes credits for embedding and each query consumes credits to obtain the response.")

    # --- Footer ---
    st.markdown("""---""")
    _write_html('Author: <a target="_blank" href="https://www.linkedin.com/in/daniel-avila-arias/">Daniel Avila</a>')
    _write_html('Repo: <a target="_blank" href="https://github.com/davila7/file-gpt">Github</a>')
    _write_html("This software was developed with Code GPT, for more information visit: https://codegpt.co")
with tab2:
    st.write('To obtain an API Key you must create an OpenAI account at the following link: https://openai.com/api/')

    # Chat history is kept in session state: 'past' holds the questions and
    # 'generated' the answers, appended together so index i pairs them up.
    if 'generated' not in st.session_state:
        st.session_state['generated'] = []
    if 'past' not in st.session_state:
        st.session_state['past'] = []

    def get_text():
        """Render the question box and return its current text.

        Returns:
            The text entered in the "You:" text area, or ``None`` when no
            API key has been entered (in which case nothing is rendered).
        """
        if not user_secret:
            return None
        st.header("Ask me something about the document:")
        # Editing the question clears the "submit" flag via `clear_submit`.
        return st.text_area("You:", on_change=clear_submit)

    user_input = get_text()
    button = st.button("Submit")

    # NOTE(review): "submit" stays True until the question or the uploaded
    # file changes, so an unrelated rerun appears to re-send the same question
    # and append a duplicate history entry — confirm whether that is intended.
    if button or st.session_state.get("submit"):
        if not user_input:
            st.error("Please enter a question!")
        elif index is None:
            # No file uploaded, or embedding failed: there is nothing to
            # search, and passing None to `search_docs` would crash.
            st.error("Please upload a document and wait for it to be indexed first!")
        else:
            st.session_state["submit"] = True
            try:
                # Both calls sit inside the try so an OpenAIError from either
                # is reported to the user rather than left uncaught.
                sources = search_docs(index, user_input)
                answer = get_answer(sources, user_input)
            except OpenAIError as e:
                st.error(e._message)
            else:
                st.session_state.past.append(user_input)
                # Keep only the text before the "SOURCES: " marker.
                st.session_state.generated.append(answer["output_text"].split("SOURCES: ")[0])

    # Render the conversation newest-first; the index keeps widget keys unique.
    if st.session_state['generated']:
        for i in reversed(range(len(st.session_state['generated']))):
            message(st.session_state["generated"][i], key=str(i))
            message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')