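# Build or update Milvus-ready mxbai-embed-large-v1 embeddings of arXiv abstracts
# for a single year: download the arXiv metadata snapshot from Kaggle, embed any
# papers not present in the previously published parquet (locally or via the
# Mixedbread AI API), then optionally upload float and binary versions of the
# embeddings to the Hugging Face Hub.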
import os
import subprocess
import sys
from datetime import datetime
from multiprocessing import cpu_count
from time import sleep, time

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from dotenv import dotenv_values
from huggingface_hub import HfApi, snapshot_download
from mixedbread_ai.client import MixedbreadAI
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

tqdm.pandas()
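
# Wall-clock timer for the runtime report printed at the end of the script.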
|
|
|
|
|
start = time()

year = str(2024)
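
# Pipeline switches:
#   FORCE  - re-download the Kaggle metadata snapshot even if it already exists
#   LOCAL  - embed locally with sentence-transformers instead of the Mixedbread AI API
#   UPLOAD - push the resulting parquet files to the Hugging Face Hub
#   BINARY - also produce a bit-packed (binary) copy of the embeddings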
|
|
|
|
|
FORCE = True
LOCAL = False
UPLOAD = True
BINARY = True

print('Configuration:')
print(f'Year: {year}')
print(f'Force: {FORCE}')
print(f'Local: {LOCAL}')
print(f'Upload: {UPLOAD}')
print(f'Binary: {BINARY}')
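
# Embedding model, target Hugging Face dataset repo for the float embeddings,
# and secrets loaded from a local .env file (or from the Space environment).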
|
|
|
|
|
|
|
|
|
model_name = "mixedbread-ai/mxbai-embed-large-v1"

num_cores = cpu_count() - 1

repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"

config = dotenv_values(".env")


def is_running_in_huggingface_space():
    return "SPACE_ID" in os.environ
|
|
|
|
|
|
|
|
|
|
|
dataset_path = 'Cornell-University/arxiv'

download_folder = 'data'

download_file = f'{download_folder}/arxiv-metadata-oai-snapshot.json'

if not os.path.exists(download_file) or FORCE:

    print(f'Downloading {download_file}; if it already exists it will be overwritten')
    print('Set FORCE = False to skip the download when the file already exists')

    subprocess.run(
        ['kaggle', 'datasets', 'download', '--dataset', dataset_path, '--path', download_folder, '--unzip'],
        check=True
    )

    print(f'Downloaded {download_file}')

else:

    print(f'{download_file} already exists, skipping download')
    print('Set FORCE = True to force a re-download')
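
# Load the JSON snapshot with the `datasets` library and convert it to a
# pandas DataFrame for filtering and column manipulation.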
|
|
|
|
|
|
|
|
|
|
|
|
|
print(f"Loading json metadata") |
|
dataset = load_dataset("json", data_files= str(f"{download_file}")) |
|
|
|
|
|
|
|
print(f"Converting metadata into pandas") |
|
arxiv_metadata_all = dataset['train'].to_pandas() |
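
# arXiv IDs encode the submission month: old-style IDs ("hep-th/9901001") carry
# YYMM after the slash, new-style IDs ("2401.12345") carry it before the dot.
# extract_month_year() parses that prefix and returns either the month name or
# the four-digit year.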
|
|
|
|
|
|
|
|
|
|
|
def extract_month_year(arxiv_id, what='month'):

    yymm = arxiv_id.split('/')[-1][:4] if '/' in arxiv_id else arxiv_id.split('.')[0]

    date = datetime.strptime(yymm, '%y%m')

    return date.strftime('%B') if what == 'month' else date.strftime('%Y')


print("Adding year to metadata")
arxiv_metadata_all['year'] = arxiv_metadata_all['id'].progress_apply(extract_month_year, what='year')

print(f"Filtering metadata by year: {year}")
arxiv_metadata_split = arxiv_metadata_all[arxiv_metadata_all['year'] == year]
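
# Embedding backend: either load mxbai-embed-large-v1 locally with
# sentence-transformers (on GPU if available) or create a Mixedbread AI API
# client, with the API key taken from the Space environment or the .env file.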
|
|
|
|
|
|
|
|
|
if LOCAL:

    print("Setting up local embedding model")
    print("To use the mxbai API, set LOCAL = False")

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    print(f"Loading model {model_name} on device: {device}")
    model = SentenceTransformer(model_name)
    model = model.to(device)

else:

    print("Setting up mxbai API client")
    print("To use local resources, set LOCAL = True")

    if is_running_in_huggingface_space():
        mxbai_api_key = os.getenv("MXBAI_API_KEY")
    else:
        mxbai_api_key = config["MXBAI_API_KEY"]

    mxbai = MixedbreadAI(api_key=mxbai_api_key)
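
# embed() turns one abstract into a float32 numpy vector using whichever backend
# was configured above; the API path sleeps briefly before each call as a simple
# rate limit.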
|
|
|
|
|
|
|
def embed(input_text):

    if LOCAL:

        embedding = model.encode(input_text, device=device, precision="float32")

        embedding = np.array(embedding, dtype=np.float32)

    else:

        sleep(0.2)

        result = mxbai.embeddings(
            model='mixedbread-ai/mxbai-embed-large-v1',
            input=input_text,
            normalized=True,
            encoding_format='float',
            truncation_strategy='end'
        )

        embedding = np.array(result.data[0].embedding, dtype=np.float32)

    return embedding
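
# Fetch the previously published parquet for this year (if any) from the Hub so
# that only papers not yet embedded need to go through the model.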
|
|
|
|
|
|
|
|
|
|
|
|
|
folder_in_repo = "data"
allow_patterns = f"{folder_in_repo}/{year}.parquet"

local_dir = repo_id

repo_type = "dataset"

os.makedirs(local_dir, exist_ok=True)

snapshot_download(repo_id=repo_id, repo_type=repo_type, local_dir=local_dir, allow_patterns=allow_patterns)

try:

    previous_embed = f'{local_dir}/{folder_in_repo}/{year}.parquet'

    print(f"Loading previously embedded file: {previous_embed}")
    previous_embeddings = pd.read_parquet(previous_embed)

except Exception as e:

    print(f"Loading previous embeddings failed with: {e}")
    print(f"No previous embeddings found for year: {year}")
    print("Creating new embeddings for all papers")
    previous_embeddings = pd.DataFrame(columns=['id', 'vector', 'title', 'abstract', 'authors', 'categories', 'month', 'year', 'url'])
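
# Select papers that are not already in the previous embeddings and drop
# duplicate IDs within the new batch, then embed their abstracts.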
|
|
|
|
|
|
|
|
|
|
|
new_papers = arxiv_metadata_split[~arxiv_metadata_split['id'].isin(previous_embeddings['id'])]

new_papers = new_papers.drop_duplicates(subset='id', keep='last', ignore_index=True)

num_new_papers = len(new_papers)

if num_new_papers == 0:
    print(f"No new papers found for year: {year}")
    print("Exiting")
    sys.exit()

print(f"Creating new embeddings for: {num_new_papers} entries")
new_papers["vector"] = new_papers["abstract"].progress_apply(embed)
|
|
|
|
|
print("Adding url and month columns") |
|
|
|
|
|
new_papers['url'] = 'https://arxiv.org/abs/' + new_papers['id'] |
|
|
|
|
|
new_papers['month'] = new_papers['id'].progress_apply(extract_month_year, what='month') |
|
|
|
|
|
print("Removing newline characters from title, authors, categories, abstract") |
|
|
|
|
|
new_papers['title'] = new_papers['title'].astype(str).str.replace('\n', ' ', regex=False) |
|
|
|
new_papers['authors'] = new_papers['authors'].astype(str).str.replace('\n', ' ', regex=False) |
|
|
|
new_papers['categories'] = new_papers['categories'].astype(str).str.replace('\n', ' ', regex=False) |
|
|
|
new_papers['abstract'] = new_papers['abstract'].astype(str).str.replace('\n', ' ', regex=False) |
|
|
|
|
|
print("Trimming title, authors, categories, abstract") |
|
|
|
|
|
new_papers['title'] = new_papers['title'].progress_apply(lambda x: x[:508] + '...' if len(x) > 512 else x) |
|
|
|
|
|
new_papers['categories'] = new_papers['categories'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x) |
|
|
|
|
|
new_papers['authors'] = new_papers['authors'].progress_apply(lambda x: x[:124] + '...' if len(x) > 128 else x) |
|
|
|
|
|
new_papers['abstract'] = new_papers['abstract'].progress_apply(lambda x: x[:3068] + '...' if len(x) > 3072 else x) |
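
# Merge the new rows with the previous embeddings and save the combined table
# as a year-specific parquet file.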
|
|
|
|
|
print("Concatenating previouly embedded dataframe with new embeddings") |
|
|
|
|
|
selected_columns = ['id', 'vector', 'title', 'abstract', 'authors', 'categories', 'month', 'year', 'url'] |
|
|
|
|
|
new_embeddings = pd.concat([previous_embeddings, new_papers[selected_columns]]) |
|
|
|
|
|
embed_folder = f"{year}-diff-embed" |
|
os.makedirs(embed_folder, exist_ok=True) |
|
|
|
|
|
embed_filename = f'{embed_folder}/{year}.parquet' |
|
print(f"Saving newly embedded dataframe to: {embed_filename}") |
|
|
|
|
|
new_embeddings.to_parquet(embed_filename, index=False) |
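
# Upload the float embeddings to the Hub; the token comes from the Space
# environment when running in a Hugging Face Space, otherwise from .env.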
|
|
|
|
|
|
|
|
|
if UPLOAD:

    print(f"Uploading new embeddings to: {repo_id}")

    if is_running_in_huggingface_space():
        access_token = os.getenv("HF_API_KEY")
    else:
        access_token = config["HF_API_KEY"]

    api = HfApi(token=access_token)

    api.upload_folder(repo_id=repo_id, folder_path=embed_folder, path_in_repo=folder_in_repo, repo_type="dataset")

    print(f"Upload complete for year: {year}")

else:
    print("Not uploading new embeddings to the repo")
    print("To upload new embeddings, set UPLOAD to True")
|
|
|
|
|
|
|
if BINARY:

    print(f"Binarising the data for year: {year}")
    print("Set BINARY = False to keep only the float embeddings")

    def dense_to_binary(dense_vector):
        return np.packbits(np.where(dense_vector >= 0, 1, 0)).tobytes()

    binary_folder = f"{year}-binary-embed"
    os.makedirs(binary_folder, exist_ok=True)

    new_embeddings['vector'] = new_embeddings['vector'].progress_apply(dense_to_binary)

    new_embeddings.to_parquet(f'{binary_folder}/{year}.parquet', index=False)
|
|
|
if BINARY and UPLOAD:

    repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary"
    repo_type = "dataset"

    api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)

    folder_in_repo = "data"

    print(f"Uploading binary embeddings to {repo_id} from folder {binary_folder}")

    api.upload_folder(repo_id=repo_id, folder_path=binary_folder, path_in_repo=folder_in_repo, repo_type=repo_type)

    print("Upload complete")

else:
    print("Not uploading binary embeddings to the repo")
    print("To upload binary embeddings, set both UPLOAD and BINARY to True")
|
|
|
|
|
|
|
|
|
end = time()

print(f"Time taken: {end - start:.2f} seconds")

print("Done!")