# Spaces: Paused
import pandas as pd | |
import numpy as np | |
from glob import glob | |
import os | |
from tqdm import tqdm | |
from huggingface_hub import snapshot_download # Download previous embeddings | |
from huggingface_hub import HfApi # To transact with huggingface.co | |
import gradio as gr # Create a Gradio interface so spaces doesnt timeout | |
tqdm.pandas() | |
####################################################################################### | |
# Step 1: pull the source dataset snapshot from the Hugging Face Hub.
print("Downloading repo")

# Hub coordinates of the dataset that is downloaded
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"
repo_type = "dataset"

# Local destination: the current working directory, so the repo's files land
# next to this script (the conversion step later globs data/*.parquet).
local_dir = "."

snapshot_download(
    repo_id=repo_id,
    repo_type=repo_type,
    local_dir=local_dir,
)
# ####################################################################################### | |
# # Function to convert dense vector to binary vector | |
# def dense_to_binary(dense_vector): | |
# return np.packbits(np.where(dense_vector >= 0, 1, 0)).tobytes() | |
# # Gather fp32 files | |
# floats = glob('data/*.parquet') | |
# # Create a folder to store binary embeddings | |
# os.makedirs('binary_embeddings', exist_ok=True) | |
# # Convert and save each file | |
# for file in floats: | |
# print(f"Processing file: {file}") | |
# df = pd.read_parquet(file) | |
# df['vector'] = df['vector'].progress_apply(dense_to_binary) | |
# df.to_parquet(f'binary_embeddings/{os.path.basename(file)}') | |
# ####################################################################################### | |
# # Upload the new embeddings to the repo | |
# access_token = os.getenv("HF_API_KEY") | |
# api = HfApi(token=access_token) | |
# # Setup transaction details | |
# repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary" | |
# repo_type = "dataset" | |
# api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True) | |
# # Subfolder in the repo of the dataset where the file is stored | |
# folder_in_repo = "data" | |
# # Path to the folder containing the new embeddings | |
# embed_folder = "binary_embeddings" | |
# print(f"Uploading embeddings to {repo_id} from folder {embed_folder}") | |
# # Upload all files within the folder to the specified repository | |
# api.upload_folder(repo_id=repo_id, folder_path=embed_folder, path_in_repo=folder_in_repo, repo_type="dataset") | |
# print("Upload complete") | |
####################################################################################### | |
# Function to convert dense vector to binary vector | |
def dense_to_bmrl(dense_vector, size=512):
    """Binarize the leading ``size`` components of a dense vector.

    Each kept component becomes one bit: 1 when the component is ``>= 0``,
    otherwise 0 (NaN compares false and therefore maps to 0). The bits are
    packed eight per byte with ``np.packbits``.

    Args:
        dense_vector: Array-like of numbers. NumPy arrays, lists and tuples
            are all accepted.
        size: Number of leading components to keep before packing.

    Returns:
        bytes: The packed bits. When the number of kept components is not a
        multiple of 8, ``np.packbits`` pads the final byte with zero bits.
    """
    # np.asarray makes plain Python sequences comparable element-wise (a bare
    # list would raise TypeError on ``>= 0``). Slicing first means the
    # discarded tail of the vector is never thresholded.
    leading = np.asarray(dense_vector)[:size]
    return np.packbits(leading >= 0).tobytes()
# Step 2: rewrite every downloaded parquet shard with binarized vectors.

# Parquet shards found under data/
floats = glob('data/*.parquet')

# Destination folder for the converted shards (created if missing)
os.makedirs('bmrl_embeddings', exist_ok=True)

for file in floats:
    print(f"Processing file: {file}")

    # The converted shard keeps the source shard's file name
    shard_name = os.path.basename(file)

    df = pd.read_parquet(file)

    # Replace the 'vector' column in place with its packed-bit form;
    # progress_apply shows a tqdm progress bar per shard.
    binary_vectors = df['vector'].progress_apply(dense_to_bmrl)
    df['vector'] = binary_vectors

    df.to_parquet(f'bmrl_embeddings/{shard_name}')
####################################################################################### | |
# Step 3: publish the converted shards to their own dataset repo on the Hub.

# Destination dataset repository
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_bmrl"
repo_type = "dataset"

# Local folder holding the converted shards, and the folder inside the
# repository that receives them
embed_folder = "bmrl_embeddings"
folder_in_repo = "data"

# Hub client authenticated with the token from the HF_API_KEY environment
# variable (None is passed through when the variable is unset)
access_token = os.environ.get("HF_API_KEY")
api = HfApi(token=access_token)

# Create the repository on first run; exist_ok makes reruns a no-op
api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)

print(f"Uploading embeddings to {repo_id} from folder {embed_folder}")

# Upload every file in the local folder to the repository
api.upload_folder(
    repo_id=repo_id,
    folder_path=embed_folder,
    path_in_repo=folder_in_repo,
    repo_type=repo_type,
)

print("Upload complete")
def greet(name, intensity):
    """Return a greeting for ``name`` followed by exclamation marks.

    Args:
        name: Text to greet; concatenated directly, so it must be a ``str``.
        intensity: Number of exclamation marks. It is converted with
            ``int()``, so fractional values truncate and values below 1
            yield no exclamation marks.

    Returns:
        str: ``"Hello, <name>"`` followed by ``int(intensity)`` ``!`` chars.
    """
    salutation = "Hello, " + name
    emphasis = "!" * int(intensity)
    return salutation + emphasis
# Minimal Gradio app wrapping greet(): a text box and a slider in, text out.
# Per the note on the gradio import, it exists so the Space does not time out.
demo = gr.Interface(fn=greet, inputs=["text", "slider"], outputs=["text"])

# Start the web server for the interface
demo.launch()