update example and solve installation issues for Spaces
examples/fine-tune-modernbert-rag.ipynb
CHANGED
@@ -8,7 +8,7 @@
    "\n",
    "This notebook demonstrates the fine-tuning process of `modernbert-embed-base` using synthetic data tailored for the Retrieval-Augmented Generation (RAG) model.\n",
    "\n",
-   "It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT with Synthetic Data
+   "It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT for RAG with Synthetic Data](https://huggingface.co/blog/fine-tune-modernbert-for-rag-with-synthetic-data)."
   ]
  },
  {
@@ -47,7 +47,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -97,7 +97,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -140,7 +140,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@@ -152,7 +152,7 @@
     "})"
    ]
   },
-  "execution_count":
+  "execution_count": 3,
   "metadata": {},
   "output_type": "execute_result"
  }
@@ -531,7 +531,7 @@
    "# Remember to adjust the training arguments according to your requirements\n",
    "\n",
    "trainer = SentenceTransformerTrainer(\n",
-   "    model=
+   "    model=model_biencoder,\n",
    "    args=training_args,\n",
    "    train_dataset=dataset_rag_biencoder[\"train\"],\n",
    "    eval_dataset=dataset_rag_biencoder[\"eval\"],\n",
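For context on this hunk: the restored `model=model_biencoder` argument completes a `sentence-transformers` trainer cell. Below is a minimal sketch of how such a cell fits together, assuming `model_biencoder` wraps `nomic-ai/modernbert-embed-base` and that a contrastive loss such as `MultipleNegativesRankingLoss` is used; the loss choice, training arguments, and toy dataset are illustrative assumptions, not taken from the diff.

```python
# Hedged sketch of the surrounding notebook cell; only the trainer call's
# argument names come from the diff, everything else is assumed.
from datasets import Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss

model_biencoder = SentenceTransformer("nomic-ai/modernbert-embed-base")

# Stand-in for the synthetic RAG dataset: (anchor, positive) pairs.
pairs = Dataset.from_dict(
    {
        "anchor": ["What is ModernBERT?", "What does RAG retrieve?"] * 2,
        "positive": [
            "ModernBERT is a long-context encoder.",
            "RAG retrieves supporting documents.",
        ] * 2,
    }
).train_test_split(test_size=0.25)
dataset_rag_biencoder = {"train": pairs["train"], "eval": pairs["test"]}

training_args = SentenceTransformerTrainingArguments(
    output_dir="modernbert-embed-base-rag",  # assumed output path
    num_train_epochs=1,
    per_device_train_batch_size=2,
)

trainer = SentenceTransformerTrainer(
    model=model_biencoder,  # the argument this commit completes
    args=training_args,
    train_dataset=dataset_rag_biencoder["train"],
    eval_dataset=dataset_rag_biencoder["eval"],
    loss=MultipleNegativesRankingLoss(model_biencoder),
)
trainer.train()
```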
@@ -701,7 +701,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count":
+  "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -710,7 +710,7 @@
    "\n",
    "df = combined_rag_dataset.to_pandas()\n",
    "df = df.drop_duplicates(subset=[\"context\"]) # drop duplicates based on \"context\" column\n",
-   "
+   "df = df.sample(n=10, random_state=42) # optional: sample a subset of the dataset\n",
    "dataset = Dataset.from_pandas(df)\n",
    "\n",
    "docs = [Document(content=doc[\"context\"]) for doc in dataset]"
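The changed line above caps the corpus at a small sample before building documents. A sketch of the full cell with stand-in data, assuming `Document` is Haystack's document class, as the notebook's RAG pipeline suggests:

```python
# Sketch with toy data; the real cell reads combined_rag_dataset from earlier cells.
from datasets import Dataset
from haystack import Document  # assumption: the notebook's Document import

combined_rag_dataset = Dataset.from_dict(
    {"context": ["alpha doc", "alpha doc", "beta doc", "gamma doc"]}
)

df = combined_rag_dataset.to_pandas()
df = df.drop_duplicates(subset=["context"])  # drop duplicates based on "context"
df = df.sample(n=2, random_state=42)  # the commit samples a subset (n=10 upstream)
dataset = Dataset.from_pandas(df)

docs = [Document(content=doc["context"]) for doc in dataset]
print(len(docs))  # 2
```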
requirements.txt
CHANGED
@@ -1 +1,2 @@
 -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
+!apt-get install -y poppler-utils
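poppler-utils ships command-line PDF tools such as `pdftotext`. If the example ingests PDFs, a preflight check like the following can fail fast when the system package is missing; that the notebook depends on `pdftotext` is an assumption inferred from the package name, not stated in the diff.

```python
# Assumption: the example needs Poppler's CLI tools (e.g. pdftotext) for PDF ingestion.
import shutil

if shutil.which("pdftotext") is None:
    raise RuntimeError(
        "pdftotext not found on PATH; install poppler-utils, e.g. "
        "`apt-get install -y poppler-utils` on Debian/Ubuntu."
    )
```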
src/synthetic_dataset_generator/apps/rag.py
CHANGED
@@ -5,6 +5,7 @@ from typing import Union
 
 import argilla as rg
 import gradio as gr
+import nltk
 import pandas as pd
 from datasets import (
     Dataset,
@@ -50,7 +51,7 @@ from synthetic_dataset_generator.utils import (
     get_random_repo_name,
     swap_visibility,
 )
-
+nltk.download("punkt_tab")
 
 def _get_valid_columns(dataframe: pd.DataFrame):
     doc_valid_columns = []
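The `punkt_tab` download fixes a runtime error: recent NLTK releases load the Punkt sentence tokenizer from the `punkt_tab` resource instead of the older `punkt`, so tokenization fails on a fresh Space until it is fetched. A minimal reproduction of the kind of call this unblocks; where exactly `rag.py` tokenizes is an assumption.

```python
# Without nltk.download("punkt_tab"), sent_tokenize raises a LookupError
# in a fresh environment on recent NLTK versions.
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt_tab")

print(sent_tokenize("ModernBERT is an encoder. It can be fine-tuned for RAG."))
# ['ModernBERT is an encoder.', 'It can be fine-tuned for RAG.']
```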