# https://qiita.com/nekoniii3/items/5acf764af65212d9f04f
import gradio as gr
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
# from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
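# Avoid fork-related warnings/deadlocks from HuggingFace tokenizers inside the server's worker threads.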
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# OPENAI_API_KEY is read from the environment (e.g. a Space secret); never hardcode it in the source.
file_name1 = 'ALV2_ALV3DTU操作マニュアルDTU-V3SET01.pdf'
file_name2 = 'ALV3PCサーバ_ソフトウェア操作マニュアル_画像ファイル名付.pdf'
file_name3 = '美和ロック総合カタログ第31版_前半.pdf'
file_name4 = '美和ロック総合カタログ第31版_後半.pdf'
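# Each PDF is loaded page by page, so every chunk keeps its source file and page number in metadata.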
loader1 = PyMuPDFLoader(file_name1)
loader2 = PyMuPDFLoader(file_name2)
loader3 = PyMuPDFLoader(file_name3)
loader4 = PyMuPDFLoader(file_name4)
documents1 = loader1.load()
documents2 = loader2.load()
documents3 = loader3.load()
documents4 = loader4.load()
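# Split every page into chunks of at most 1000 characters (no overlap) before embedding.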
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts1 = text_splitter.split_documents(documents1)
texts2 = text_splitter.split_documents(documents2)
texts3 = text_splitter.split_documents(documents3)
texts4 = text_splitter.split_documents(documents4)
texts = texts1 + texts2 + texts3 + texts4
# embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")
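# Japanese sentence-embedding model; the commented OpenAIEmbeddings line above is the drop-in alternative.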
embeddings = HuggingFaceEmbeddings(model_name="oshizo/sbert-jsnli-luke-japanese-base-lite")
vectordb = Chroma.from_documents(texts, embeddings)
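# Untested sketch: passing persist_directory would keep the index on disk so restarts skip re-embedding, e.g.
# vectordb = Chroma.from_documents(texts, embeddings, persist_directory="./chroma_db")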
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.05)
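# "stuff" packs every retrieved chunk into a single prompt, hence the 16k-context model above.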
qa = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=vectordb.as_retriever(),
return_source_documents=True)
import shutil
def save_image_filepath(filepath: str):
    print(filepath)
    # Save the image
    _, file_extension = os.path.splitext(filepath)
    shutil.copy(filepath, './filepath{}'.format(file_extension))
import boto3
# boto3 reads AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY from the environment; never hardcode credentials.
s3 = boto3.client('s3', region_name='ap-northeast-1')
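# NOTE: the helper below assumes the buckets are publicly readable; for private buckets,
# s3.generate_presigned_url('get_object', Params={'Bucket': ..., 'Key': ...}) would be the usual route.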
# Image URL helper
def get_public_url(bucket, target_object_path):
    """
    Get the URL of the target S3 file.

    Parameters
    ----------
    bucket: string
        S3 bucket name
    target_object_path: string
        Path of the file inside S3 to retrieve

    Returns
    ----------
    url: string
        URL of the object on S3
    """
    bucket_location = s3.get_bucket_location(Bucket=bucket)
    return "https://s3-{0}.amazonaws.com/{1}/{2}".format(
        bucket_location['LocationConstraint'],
        bucket,
        target_object_path)
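# Example (assuming the bucket lives in ap-northeast-1):
#   get_public_url('page.dtu.manual', 'page001_raster.png')
#   -> 'https://s3-ap-northeast-1.amazonaws.com/page.dtu.manual/page001_raster.png'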
import fitz
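# Open the source PDFs so note annotations (which hold image keys) can be read; only doc2 is used below.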
doc1 = fitz.open(file_name1)
doc2 = fitz.open(file_name2)
import math
with gr.Blocks() as demo:
chatbot = gr.Chatbot()
msg = gr.Textbox()
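    # user() builds the reply (answer + source links); bot() just re-emits the updated history.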
def user(user_message, history):
        reply2 = qa.invoke({"query": user_message})  # RetrievalQA's input key is "query"
        reply = reply2['result']
for sd in reply2["source_documents"]:
# page_content = str(sd.page_content)
source = str(sd.metadata["source"])
page = sd.metadata["page"]+1
page_num = str(page).zfill(3)
# print("PDF๏ผ" + source)
# print("ใใผใธ๏ผ" + page_num)
if source == file_name1:
                # Get the URL of the page image
bucket='page.dtu.manual'
key='page'+page_num+'_raster.png'
url = get_public_url(bucket, key)
                reply = reply + ' <a href="'+url+'">'+page_num+'</a>'
elif source == file_name2:
                # Get the URL of the page image
bucket='page.server.manual'
key='page'+page_num+'_raster.png'
url = get_public_url(bucket, key)
                reply = reply + ' <a href="'+url+'">'+page_num+'</a>'
                # Get the URLs of the images pasted into the PDF
bucket='image.server.manual'
                page2 = doc2[page - 1]  # PyMuPDF pages are 0-indexed; 'page' here is 1-based
page_annotations = page2.annots()
for annotation in page_annotations:
                    # Read the note annotation's text; it holds the image's S3 key
                    key = annotation.info.get('content', '')
url = get_public_url(bucket, key)
                    reply = reply + ' <a href="'+url+'">'+key+'</a>'
elif source == file_name3:
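                # The online catalog viewer shows two PDF pages per spread, hence the /2 mapping.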
page2 = str(math.floor(1+float(page_num)/2))
url = "https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg="+page2+"&v=MIW10001&d=LINK_MIW"
reply = reply + ' <a href="'+url+'">'+page2+'</a>'
elif source == file_name4:
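                # The second half's numbering restarts at 1; 486 is presumably the first half's page count.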
page2 = str(math.floor(1+(486+float(page_num))/2))
url = "https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg="+page2+"&v=MIW10001&d=LINK_MIW"
reply = reply + ' <a href="'+url+'">'+page2+'</a>'
            else:
                # Unknown source document: skip it rather than terminating the process
                continue
return "", history + [[user_message, reply]]
def bot(history):
yield history
msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
bot, chatbot, chatbot
)
demo.queue()
demo.launch(share=True)