sdiazlor committed
Commit b844fe3 · 1 Parent(s): c0bdbeb

update example and solve installation issues for Spaces

examples/fine-tune-modernbert-rag.ipynb CHANGED
@@ -8,7 +8,7 @@
    "\n",
    "This notebook demonstrates the fine-tuning process of `modernbert-embed-base` using synthetic data tailored for the Retrieval-Augmented Generation (RAG) model.\n",
    "\n",
-   "It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT with Synthetic Data for RAG](https://huggingface.co/blog/fine-tune-modernbert-with-synthetic-data-for-rag)."
+   "It provides a complete walkthrough of the fine-tuning process after generating synthetic data using the Synthetic Data Generator. For a comprehensive explanation of the methodology and additional details, refer to the blog post: [Fine-tune ModernBERT for RAG with Synthetic Data](https://huggingface.co/blog/fine-tune-modernbert-for-rag-with-synthetic-data)."
   ]
  },
  {
@@ -47,7 +47,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -97,7 +97,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -140,7 +140,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
@@ -152,7 +152,7 @@
      "})"
     ]
    },
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -531,7 +531,7 @@
    "# Remember to adjust the training arguments according to your requirements\n",
    "\n",
    "trainer = SentenceTransformerTrainer(\n",
-   "    model=model,\n",
+   "    model=model_biencoder,\n",
    "    args=training_args,\n",
    "    train_dataset=dataset_rag_biencoder[\"train\"],\n",
    "    eval_dataset=dataset_rag_biencoder[\"eval\"],\n",
@@ -701,7 +701,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -710,7 +710,7 @@
    "\n",
    "df = combined_rag_dataset.to_pandas()\n",
    "df = df.drop_duplicates(subset=[\"context\"]) # drop duplicates based on \"context\" column\n",
-   "# df = df.sample(n=100, random_state=42) # optional: sample a subset of the dataset\n",
+   "df = df.sample(n=10, random_state=42) # optional: sample a subset of the dataset\n",
    "dataset = Dataset.from_pandas(df)\n",
    "\n",
    "docs = [Document(content=doc[\"context\"]) for doc in dataset]"
requirements.txt CHANGED
@@ -1 +1,2 @@
 -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
+!apt-get install -y poppler-utils
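`poppler-utils` supplies system binaries such as `pdftotext` that Python PDF loaders commonly shell out to, which pip alone cannot install. A quick runtime sanity check for the dependency (a hypothetical helper, not part of this repo):

```python
import shutil

# pdftotext ships with poppler-utils; PDF ingestion fails without it.
if shutil.which("pdftotext") is None:
    raise RuntimeError(
        "poppler-utils is missing; install it with `apt-get install -y poppler-utils`."
    )
```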
src/synthetic_dataset_generator/apps/rag.py CHANGED
@@ -5,6 +5,7 @@ from typing import Union
 
 import argilla as rg
 import gradio as gr
+import nltk
 import pandas as pd
 from datasets import (
     Dataset,
@@ -50,7 +51,7 @@ from synthetic_dataset_generator.utils import (
     get_random_repo_name,
     swap_visibility,
 )
-
+nltk.download("punkt_tab")
 
 def _get_valid_columns(dataframe: pd.DataFrame):
     doc_valid_columns = []
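The `punkt_tab` package contains the pretrained Punkt models behind NLTK's sentence tokenizer; downloading it once at module import avoids a `LookupError` the first time the app tokenizes text. A minimal sketch of the pattern (the example sentence is illustrative):

```python
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-tokenizer data; a no-op if already cached.
nltk.download("punkt_tab")

text = "ModernBERT handles long contexts. It pairs well with RAG."
print(sent_tokenize(text))
# ['ModernBERT handles long contexts.', 'It pairs well with RAG.']
```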