Zeeshan42 committed on
Commit
41ac1e9
·
verified ·
1 Parent(s): b2a59e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -15
app.py CHANGED
@@ -1,5 +1,5 @@
1
  from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
2
- from datasets import Dataset
3
  from groq import Groq
4
  import os
5
 
@@ -8,27 +8,31 @@ client = Groq(api_key="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"
8
 
9
  # Book names (replace with your uploaded book names on Hugging Face)
10
  book_names = {
11
- "DSM": "Diagnostic_and_statistical_manual_of_mental_disorders_DSM5.pdf",
12
- "Personality": "Theories_of_Personality_10.pdf",
13
- "SearchForMeaning": "Mans_Search_For_Meaning.pdf"
14
  }
15
 
16
  # Function to load and preprocess the data from books (now using Hugging Face datasets)
17
  def load_data(book_names):
18
  data = []
19
  for title, book_name in book_names.items():
20
- # Assuming books are stored in Hugging Face datasets or other accessible locations
21
- # Here you will load the dataset from Hugging Face directly if it's uploaded
22
- # For example, use `datasets.load_dataset` to load the books if they are uploaded
23
 
24
- # Example (replace with actual loading mechanism based on how the books are stored on Hugging Face):
25
- dataset = Dataset.from_huggingface_dataset(book_name)
26
-
27
- # Assuming the dataset contains the text, we split it by paragraphs
28
- paragraphs = dataset["text"].split("\n\n") # Adjust based on actual dataset structure
29
- for paragraph in paragraphs:
30
- if paragraph.strip(): # Skip empty paragraphs
31
- data.append({"text": paragraph.strip()})
 
 
 
 
32
  return Dataset.from_list(data)
33
 
34
  # Load and preprocess dataset for fine-tuning
 
1
  from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
2
+ from datasets import load_dataset, Dataset
3
  from groq import Groq
4
  import os
5
 
 
8
 
9
  # Book names (replace with your uploaded book names on Hugging Face)
10
# Short title -> Hugging Face dataset name for each source book.
book_names = {
    "DSM": "Diagnostic_and_statistical_manual_of_mental_disorders_DSM5",
    "Personality": "Theories_of_Personality_10",
    "SearchForMeaning": "Mans_Search_For_Meaning",
}
15
 
16
  # Function to load and preprocess the data from books (now using Hugging Face datasets)
17
def load_data(book_names):
    """Load book text from Hugging Face datasets and split it into paragraphs.

    Args:
        book_names: Mapping of short title -> Hugging Face dataset name
            (the dataset is expected to expose a "train" split with a
            "text" column -- TODO confirm against the uploaded datasets).

    Returns:
        A ``datasets.Dataset`` with a single "text" column, one row per
        non-empty paragraph. Books that fail to load are skipped with a
        printed warning rather than aborting the whole corpus build.
    """
    data = []
    for title, book_name in book_names.items():
        try:
            dataset = load_dataset(book_name)  # load by dataset name
            # BUG FIX: indexing a split by column name returns a LIST of
            # strings (one per row), not a single string, so the previous
            # `text.split("\n\n")` raised AttributeError and the except
            # clause silently skipped every book. Iterate the rows and
            # split each one into paragraphs instead.
            rows = dataset["train"]["text"]
            if isinstance(rows, str):  # defensive: tolerate a single string
                rows = [rows]
            for row in rows:
                for paragraph in row.split("\n\n"):
                    paragraph = paragraph.strip()
                    if paragraph:  # skip empty paragraphs
                        data.append({"text": paragraph})
        except Exception as e:
            # Best-effort loading: report the failure and continue so one
            # bad/missing book does not prevent the others from loading.
            print(f"Error loading dataset for {book_name}: {e}")
            continue
    return Dataset.from_list(data)
37
 
38
  # Load and preprocess dataset for fine-tuning