Zeeshan42 committed
Commit b2a59e1 · verified
1 Parent(s): 23fe81f

Update app.py

Files changed (1)
  1. app.py +21 -15
app.py CHANGED
@@ -6,27 +6,33 @@ import os
 # Initialize Groq client with your API key
 client = Groq(api_key="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")
 
-# Paths to your books
-book_paths = {
-    "DSM": "/app/Diagnostic and statistical manual of mental disorders _ DSM-5 ( PDFDrive.com ).pdf",
-    "Personality": "/b6c3v8_Theories_of_Personality_10.pdf",
-    "SearchForMeaning": "/Mans-Search-For-Meaning.pdf"
+# Book names (replace with your uploaded book names on Hugging Face)
+book_names = {
+    "DSM": "Diagnostic_and_statistical_manual_of_mental_disorders_DSM5.pdf",
+    "Personality": "Theories_of_Personality_10.pdf",
+    "SearchForMeaning": "Mans_Search_For_Meaning.pdf"
 }
 
-# Function to load and preprocess the data from books
-def load_data(paths):
+# Function to load and preprocess the data from books (now using Hugging Face datasets)
+def load_data(book_names):
     data = []
-    for title, path in paths.items():
-        with open(path, "r", encoding="utf-8", errors='ignore') as file:
-            text = file.read()
-        paragraphs = text.split("\n\n")  # Split by paragraphs (adjust as needed)
-        for paragraph in paragraphs:
-            if paragraph.strip():  # Skip empty paragraphs
-                data.append({"text": paragraph.strip()})
+    for title, book_name in book_names.items():
+        # Assuming the books are stored as Hugging Face datasets or in another
+        # accessible location, load each one directly from the Hub once uploaded,
+        # e.g. with `datasets.load_dataset` (requires: from datasets import load_dataset).
+
+        # Example (replace with the actual loading mechanism for how the books are stored):
+        dataset = load_dataset(book_name, split="train")
+
+        # dataset["text"] is a list of strings, so join it before splitting into paragraphs
+        paragraphs = "\n\n".join(dataset["text"]).split("\n\n")  # Adjust based on actual dataset structure
+        for paragraph in paragraphs:
+            if paragraph.strip():  # Skip empty paragraphs
+                data.append({"text": paragraph.strip()})
     return Dataset.from_list(data)
 
 # Load and preprocess dataset for fine-tuning
-dataset = load_data(book_paths)
+dataset = load_data(book_names)
 
 # Load pretrained model and tokenizer from Hugging Face
 model_name = "gpt2"  # Replace with a larger model if needed and feasible
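
The `load_dataset(book_name, split="train")` call above is still a placeholder: the values in `book_names` are PDF filenames, not dataset repo IDs, and the commit's comments explicitly leave the real loading mechanism open. A minimal sketch of one way to make `load_data` concrete, assuming the PDFs are uploaded to a hypothetical dataset repo `your-username/psychology-books` and using `huggingface_hub` plus `pypdf` for text extraction (neither library appears in the original):

from datasets import Dataset
from huggingface_hub import hf_hub_download
from pypdf import PdfReader

def load_data(book_names, repo_id="your-username/psychology-books"):  # hypothetical repo ID
    data = []
    for title, book_name in book_names.items():
        # Fetch the PDF file itself from the dataset repo on the Hub
        path = hf_hub_download(repo_id=repo_id, filename=book_name, repo_type="dataset")
        # Extract real text page by page instead of reading raw PDF bytes
        reader = PdfReader(path)
        text = "\n\n".join(page.extract_text() or "" for page in reader.pages)
        for paragraph in text.split("\n\n"):
            if paragraph.strip():  # Skip empty paragraphs
                data.append({"text": paragraph.strip()})
    return Dataset.from_list(data)

Unlike the removed `open(path).read()` version, this pulls actual text out of the PDFs; reading PDF bytes as text with `errors='ignore'` mostly yields compression artifacts rather than prose.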
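
The hunk ends just as the gpt2 tokenizer and model are loaded, so the rest of the setup is not visible in this diff. A minimal sketch of how that section plausibly continues with standard `transformers` calls (the `tokenized` name and the `max_length` value are illustrative assumptions, not from the commit):

from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # Replace with a larger model if needed and feasible
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# GPT-2 ships without a padding token; reuse EOS so batched tokenization can pad
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the paragraph dataset produced by load_data
tokenized = dataset.map(
    lambda batch: tokenizer(batch["text"], truncation=True, max_length=512),
    batched=True,
)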