# Spaces: yosuke123456/chatappdemo01
# Runtime error
# File size: 5,600 Bytes

# https://qiita.com/nekoniii3/items/5acf764af65212d9f04f

import gradio as gr

import os

from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
# from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings


# Disable parallelism in the Hugging Face tokenizers library
# (presumably to avoid its fork-related warning; confirm if this matters).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# SECURITY: an OpenAI API key used to be hard-coded on this line (commented
# out). It has been removed; treat that key as compromised and revoke it.
# Provide OPENAI_API_KEY through the deployment's secret/environment settings
# instead of committing it to the source.

# Source PDFs that make up the knowledge base. The individual names are kept
# because the chat handler compares each retrieved chunk's "source" metadata
# against them to decide which kind of link to emit.
file_name1 = 'ALV2_ALV3DTU操作マニュアルDTU-V3SET01.pdf'
file_name2 = 'ALV3PCサーバ_ソフトウェア操作マニュアル_画像ファイル名付.pdf'
file_name3 = '美和ロック総合カタログ第31版_前半.pdf'
file_name4 = '美和ロック総合カタログ第31版_後半.pdf'

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Load every PDF and split it into chunks, accumulating all chunks in file
# order. This replaces four copy-pasted load/split sequences with one loop;
# the resulting ``texts`` list is the same.
texts = []
for pdf_path in (file_name1, file_name2, file_name3, file_name4):
    pdf_documents = PyMuPDFLoader(pdf_path).load()
    texts.extend(text_splitter.split_documents(pdf_documents))

# Embed the chunks with a Hugging Face sentence-embedding model
# (OpenAI's "text-embedding-ada-002" was the earlier alternative).
embeddings = HuggingFaceEmbeddings(
    model_name="oshizo/sbert-jsnli-luke-japanese-base-lite",
)

# Index all chunks in a Chroma vector store.
vectordb = Chroma.from_documents(texts, embeddings)

# Chat model used to compose the answers.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.05)

# Retrieval QA chain; the source documents are returned alongside the answer
# so that links to the originating pages can be appended to the reply.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    chain_type="stuff",
    return_source_documents=True,
)

import shutil
def save_image_filepath(filepath: str):
    """Copy the file at ``filepath`` into the current working directory.

    The path is printed first, then the file is copied to
    ``./filepath<ext>``, where ``<ext>`` is the extension of the source file
    (including the leading dot). An existing destination is overwritten.
    """
    print(filepath)
    # Save the image under a fixed base name, keeping the original extension.
    extension = os.path.splitext(filepath)[1]
    destination = f'./filepath{extension}'
    shutil.copy(filepath, destination)

import boto3
# SECURITY: the AWS access key and secret key used to be hard-coded here.
# They have been removed; treat that key pair as compromised and rotate it.
# boto3 resolves credentials through its default chain, so supply
# AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY via the deployment's
# secret/environment settings (or use an IAM role / shared credentials file).
s3 = boto3.client('s3', region_name='ap-northeast-1')


# Public URL helper for objects (page images) stored in S3
def get_public_url(bucket, target_object_path):
    """
    Build the HTTPS URL of an object stored in S3.

    The bucket's region is looked up with ``get_bucket_location`` on the
    module-level ``s3`` client on every call.

    Parameters
    ----------
    bucket: string
        Name of the S3 bucket.
    target_object_path: string
        Key (path) of the object inside the bucket.

    Returns
    ----------
    url: string
        Path-style URL of the object, with the key percent-encoded.
    """
    from urllib.parse import quote

    bucket_location = s3.get_bucket_location(Bucket=bucket)
    region = bucket_location.get('LocationConstraint')
    if not region:
        # S3 reports a null LocationConstraint for buckets in us-east-1;
        # formatting it directly produced "s3-None.amazonaws.com".
        region = 'us-east-1'
    elif region == 'EU':
        # Legacy constraint value that denotes eu-west-1.
        region = 'eu-west-1'

    # Use the "s3.<region>" host form rather than the legacy "s3-<region>"
    # form, which only exists for some older regions. Path-style addressing
    # is kept as in the original.
    # Percent-encode the key so spaces and non-ASCII characters yield a valid
    # URL; "/" is left intact because it separates key prefixes.
    return "https://s3.{0}.amazonaws.com/{1}/{2}".format(
        region,
        bucket,
        quote(target_object_path, safe='/'))

import fitz
# Open the two operation manuals with PyMuPDF. doc2 is used by the chat
# handler to read page annotations; doc1 is not referenced in the visible code.
doc1, doc2 = (fitz.open(pdf_name) for pdf_name in (file_name1, file_name2))

import math

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()

    msg = gr.Textbox()

    def _link(url, label):
        """Return an HTML anchor (with a leading space) pointing at ``url``."""
        # Always quote the href value; the S3 links were previously emitted
        # unquoted, unlike the catalogue links.
        return ' <a href="' + url + '">' + label + '</a>'

    def user(user_message, history):
        """Answer ``user_message`` and append the exchange to ``history``.

        The question is run through the ``qa`` chain. For every source
        document the chain returns, a link is appended to the answer:

        * DTU manual (``file_name1``): the page image in S3.
        * Server manual (``file_name2``): the page image in S3, plus one link
          per annotation on that PDF page (the annotation's content is used
          as the S3 key).
        * Catalogue, first/second half (``file_name3``/``file_name4``): the
          matching page of the online catalogue viewer.

        Returns
        -------
        tuple
            ``("", new_history)`` - the empty string clears the textbox and
            ``new_history`` is ``history`` plus ``[user_message, reply]``.
        """
        response = qa(user_message)
        reply = response['result']

        for sd in response["source_documents"]:
            source = str(sd.metadata["source"])
            # The loader's page metadata is treated as a 0-based index: the
            # displayed page number is index + 1, while PyMuPDF documents are
            # indexed from 0.
            page_index = sd.metadata["page"]
            page = page_index + 1
            page_num = str(page).zfill(3)

            if source == file_name1:
                # Link to the rendered page image.
                url = get_public_url('page.dtu.manual', 'page' + page_num + '_raster.png')
                reply += _link(url, page_num)

            elif source == file_name2:
                # Link to the rendered page image.
                url = get_public_url('page.server.manual', 'page' + page_num + '_raster.png')
                reply += _link(url, page_num)

                # Links to the images referenced by the page's annotations.
                # Index with the 0-based page_index: the original used the
                # 1-based number, which read the *next* page and raised
                # IndexError on the last page of the manual.
                bucket = 'image.server.manual'
                for annotation in doc2[page_index].annots():
                    # The annotation's text content holds the S3 object key.
                    key = annotation.info.get('content', '')
                    if not key:
                        # No key to link to; skip rather than emit a link to
                        # the bucket root with empty text.
                        continue
                    reply += _link(get_public_url(bucket, key), key)

            elif source == file_name3:
                # The viewer's "pg" value advances once per two PDF pages
                # (presumably a two-page spread; confirm against the viewer).
                page2 = str(1 + page // 2)
                url = "https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg="+page2+"&v=MIW10001&d=LINK_MIW"
                reply += _link(url, page2)

            elif source == file_name4:
                # 486 is presumably the page count of the first-half PDF, so
                # second-half pages continue the numbering; TODO confirm.
                page2 = str(1 + (486 + page) // 2)
                url = "https://dcs.mediapress-net.com/iportal/cv.do?c=20958580000&pg="+page2+"&v=MIW10001&d=LINK_MIW"
                reply += _link(url, page2)

            else:
                # Unknown source: add no link. The original called exit(0)
                # here, which raised SystemExit inside the request handler.
                continue

        return "", history + [[user_message, reply]]

    def bot(history):
        """Pass the chat history through unchanged (single yield)."""
        yield history

    # On submit: compute the reply and clear the textbox, then re-emit the
    # history via ``bot``.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )
    
# Enable request queuing, then start the app with a public share link.
demo.queue().launch(share=True)