# bmrl / app.py
# (Hugging Face Space by bluuebunny — commit 44aac25; web-page header text
# converted to comments so the file is valid Python.)
import pandas as pd
import numpy as np
from glob import glob
import os
from tqdm import tqdm
from huggingface_hub import snapshot_download # Download previous embeddings
from huggingface_hub import HfApi # To transact with huggingface.co
import gradio as gr # Create a Gradio interface so spaces doesnt timeout
tqdm.pandas()  # register .progress_apply() on pandas objects (used below)
#######################################################################################
# Step 1: pull the existing fp32 embedding snapshot from the Hub into the
# working directory (populates ./data/*.parquet used below).
print("Downloading repo")

# Source dataset holding the full-precision embeddings.
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"
repo_type = "dataset"

# Download destination: the current working directory.
local_dir = "."

snapshot_download(
    repo_id=repo_id,
    repo_type=repo_type,
    local_dir=local_dir,
)
# #######################################################################################
# # Function to convert dense vector to binary vector
# def dense_to_binary(dense_vector):
# return np.packbits(np.where(dense_vector >= 0, 1, 0)).tobytes()
# # Gather fp32 files
# floats = glob('data/*.parquet')
# # Create a folder to store binary embeddings
# os.makedirs('binary_embeddings', exist_ok=True)
# # Convert and save each file
# for file in floats:
# print(f"Processing file: {file}")
# df = pd.read_parquet(file)
# df['vector'] = df['vector'].progress_apply(dense_to_binary)
# df.to_parquet(f'binary_embeddings/{os.path.basename(file)}')
# #######################################################################################
# # Upload the new embeddings to the repo
# access_token = os.getenv("HF_API_KEY")
# api = HfApi(token=access_token)
# # Setup transaction details
# repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary"
# repo_type = "dataset"
# api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
# # Subfolder in the repo of the dataset where the file is stored
# folder_in_repo = "data"
# # Path to the folder containing the new embeddings
# embed_folder = "binary_embeddings"
# print(f"Uploading embeddings to {repo_id} from folder {embed_folder}")
# # Upload all files within the folder to the specified repository
# api.upload_folder(repo_id=repo_id, folder_path=embed_folder, path_in_repo=folder_in_repo, repo_type="dataset")
# print("Upload complete")
#######################################################################################
# Truncate-and-binarize helper for the embedding column.
def dense_to_bmrl(dense_vector, size=512):
    """Keep the first *size* dimensions of a dense embedding and sign-binarize.

    Components >= 0 map to bit 1, negative components to bit 0; the bits are
    packed 8-per-byte (numpy ``packbits`` big-endian bit order) and returned
    as raw ``bytes``.
    """
    leading = np.asarray(dense_vector)[:size]
    bits = (leading >= 0).astype(np.uint8)
    return np.packbits(bits).tobytes()
# Step 2: convert every downloaded fp32 parquet shard to the truncated
# binary representation.
float_files = glob('data/*.parquet')

# Output directory for the converted shards.
os.makedirs('bmrl_embeddings', exist_ok=True)

for parquet_path in float_files:
    print(f"Processing file: {parquet_path}")
    frame = pd.read_parquet(parquet_path)
    # progress_apply = tqdm-wrapped apply; rewrites the vector column in place.
    frame['vector'] = frame['vector'].progress_apply(dense_to_bmrl)
    frame.to_parquet(f'bmrl_embeddings/{os.path.basename(parquet_path)}')
#######################################################################################
# Step 3: push the converted embeddings back to the Hub.
access_token = os.getenv("HF_API_KEY")
api = HfApi(token=access_token)

# Destination dataset for the binary (bmrl) embeddings.
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_bmrl"
repo_type = "dataset"
api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)

# Repo subfolder that will receive the files.
folder_in_repo = "data"
# Local folder produced by the conversion step above.
embed_folder = "bmrl_embeddings"

print(f"Uploading embeddings to {repo_id} from folder {embed_folder}")
api.upload_folder(
    repo_id=repo_id,
    folder_path=embed_folder,
    path_in_repo=folder_in_repo,
    repo_type="dataset",
)
print("Upload complete")
def greet(name, intensity):
    """Return ``"Hello, <name>"`` followed by ``int(intensity)`` exclamation marks."""
    bangs = "!" * int(intensity)
    return f"Hello, {name}{bangs}"
# Minimal Gradio UI wrapping greet(); keeps the Space process alive
# (a bare batch script would otherwise be treated as finished/timed out).
demo = gr.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
)
# Blocks here serving the interface; everything above has already run.
demo.launch()