Spaces:
Runtime error
Runtime error
import os | |
import json | |
import requests | |
from pymilvus import MilvusClient, DataType, Schema, Collection, utility | |
from dotenv import load_dotenv | |
# Load variables from a local .env file into the process environment first,
# so the lookups below can see them.
load_dotenv()

# Service credentials and endpoints; each is None when its variable is unset.
VERTOPAL_API_KEY = os.environ.get("VERTOPAL_API_KEY")
ZILLIZ_CLUSTER_ENDPOINT = os.environ.get("ZILLIZ_CLUSTER_ENDPOINT")
ZILLIZ_TOKEN = os.environ.get("ZILLIZ_TOKEN")
def convert_pdf_to_json(file_path):
    """Send a file to the Vertopal convert endpoint and return its connector.

    Args:
        file_path: Path of the file to upload; it is opened in binary mode.

    Returns:
        The value found at ``["result"]["output"]["connector"]`` in the JSON
        body of the response.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        requests.HTTPError: If the API responds with an error status.
        KeyError: If the response JSON lacks the expected keys.
    """
    url = "https://api.vertopal.com/v1/convert/file"
    headers = {
        "Authorization": f"Bearer {VERTOPAL_API_KEY}"
    }
    # NOTE(review): when form-encoding, requests iterates a nested dict value,
    # so "parameters" is presumably transmitted as just its key ("output").
    # Confirm against the Vertopal API whether this should be a JSON string.
    data = {
        "app": "[APP_ID]",
        "parameters": {
            "output": "json"
        }
    }
    # The context manager closes the upload handle even when the request
    # raises; previously the file object was never closed.
    with open(file_path, "rb") as pdf_file:
        response = requests.post(
            url,
            headers=headers,
            data=data,
            files={"file": pdf_file},
        )
    response.raise_for_status()
    json_data = response.json()
    return json_data["result"]["output"]["connector"]
def download_json_file(connector):
    """POST ``connector`` to the Vertopal download-URL endpoint.

    Args:
        connector: Connector value to send in the ``connector`` form field.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.HTTPError: If the API responds with an error status.
    """
    payload = {"app": "[APP_ID]", "connector": connector}
    auth_header = {"Authorization": f"Bearer {VERTOPAL_API_KEY}"}
    reply = requests.post(
        "https://api.vertopal.com/v1/download/url/get",
        headers=auth_header,
        data=payload,
    )
    reply.raise_for_status()
    return reply.json()
def create_milvus_client_and_collection(collection_name):
    """Connect to the Zilliz cluster and make sure ``collection_name`` exists.

    When the collection is missing it is created with an INT64 primary key
    field ``primary_key`` (no auto id) and a VARCHAR field ``json_data``
    (max length 65535), with dynamic fields enabled.

    Args:
        collection_name: Name of the collection to look up or create.

    Returns:
        A ``(client, collection)`` tuple: the ``MilvusClient`` built from
        ``ZILLIZ_CLUSTER_ENDPOINT`` / ``ZILLIZ_TOKEN`` and the collection
        metadata returned by ``client.describe_collection``.
    """
    client = MilvusClient(uri=ZILLIZ_CLUSTER_ENDPOINT, token=ZILLIZ_TOKEN)

    # Everything goes through the client. The ORM helpers (``utility`` and
    # ``Collection``) rely on a connection registered separately, which this
    # function never sets up, so they cannot be used with only a MilvusClient.
    if not client.has_collection(collection_name):
        schema = MilvusClient.create_schema(enable_dynamic_field=True, description="")
        schema.add_field(
            field_name="primary_key",
            datatype=DataType.INT64,
            description="The Primary Key",
            is_primary=True,
            auto_id=False,
        )
        schema.add_field(
            field_name="json_data",
            datatype=DataType.VARCHAR,
            description="JSON Data",
            max_length=65535,
        )
        # NOTE(review): no vector field is declared; confirm the target
        # Milvus/Zilliz version accepts a schema without one.
        # create_collection returns None, so its result is not used as the
        # collection handle.
        client.create_collection(collection_name, schema=schema)

    collection = client.describe_collection(collection_name)
    return client, collection
def upload_json_to_milvus(json_data, collection_name):
    """Insert ``json_data`` as a single row of ``collection_name``.

    The row's ``primary_key`` is the collection's current row count and its
    ``json_data`` field holds ``json.dumps(json_data)``.

    Args:
        json_data: JSON-serialisable object to store.
        collection_name: Target collection; it is created if missing by
            ``create_milvus_client_and_collection``.
    """
    # Only the client is needed: inserts and stats both go through it, so the
    # second tuple element is deliberately ignored.
    client, _ = create_milvus_client_and_collection(collection_name)

    stats = client.get_collection_stats(collection_name)
    # NOTE(review): row_count may lag behind very recent inserts; confirm that
    # consecutive uploads cannot end up with the same primary key.
    next_key = int(stats.get("row_count", 0))

    # MilvusClient.insert takes row dicts keyed by field name, not tuples.
    row = {"primary_key": next_key, "json_data": json.dumps(json_data)}
    client.insert(collection_name=collection_name, data=[row])
def process_pdfs(directory):
    """Convert every ``.pdf`` file in ``directory`` and upload the result.

    Each matching file is sent through ``convert_pdf_to_json``, the returned
    connector is passed to ``download_json_file``, and the downloaded JSON is
    stored in the ``pdf_json_collection`` collection. Progress is printed
    before and after each file.

    Args:
        directory: Folder to scan (not recursive); only names ending in
            ``.pdf`` (case-sensitive) are processed.
    """
    for entry in os.listdir(directory):
        # Skip anything that is not a PDF by name.
        if not entry.endswith('.pdf'):
            continue
        pdf_path = os.path.join(directory, entry)
        print(f"Processing file: {pdf_path}")
        converted = download_json_file(convert_pdf_to_json(pdf_path))
        upload_json_to_milvus(converted, "pdf_json_collection")
        print(f"Uploaded JSON data for file: {pdf_path}")
def upload_persona_json(file_path):
    """Load a persona JSON file and upload it to ``persona_collection``.

    Args:
        file_path: Path of the JSON file to read.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Binary mode lets json.load detect the file's UTF-8/16/32 encoding itself
    # instead of depending on the platform's default text encoding, which is
    # not UTF-8 on many Windows setups.
    with open(file_path, "rb") as f:
        persona_json = json.load(f)
    upload_json_to_milvus(persona_json, "persona_collection")
    print("Uploaded persona JSON to Milvus")
if __name__ == "__main__":
    # Hard-coded input locations used when the file is run as a script.
    docs_pdf_dir = "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\ILYA\\pdfs"
    persona_file = "L:\\00.Developer Playground\\DEV\\_VS-Code\\_C3P03\\_PG\\DEV\\_HubFaceRag\\_Ilya\\ILYA\\_RAG\\_v2\\ILYA\\_docs\\_RAG\\persona.json"

    # PDFs are ingested first, then the persona document.
    process_pdfs(docs_pdf_dir)
    upload_persona_json(persona_file)