Fix punkt download and path issue
Browse files
app.py
CHANGED
@@ -13,25 +13,26 @@ import os
|
|
13 |
from nltk.tokenize import word_tokenize
|
14 |
from nltk.corpus import stopwords
|
15 |
from nltk.stem import WordNetLemmatizer # Import WordNetLemmatizer
|
|
|
|
|
16 |
|
17 |
# Set NLTK data path to the local 'nltk_data' folder in your project
|
18 |
nltk_data_path = 'nltk_data'
|
19 |
nltk.data.path.append(nltk_data_path)
|
20 |
|
21 |
-
# Check if 'stopwords' is already downloaded, if not, download it
|
22 |
-
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
|
23 |
-
nltk.download('stopwords', download_dir=nltk_data_path)
|
24 |
-
|
25 |
# Check if 'punkt' is already downloaded, if not, download it
|
26 |
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
|
27 |
nltk.download('punkt', download_dir=nltk_data_path)
|
28 |
|
29 |
-
# Check if 'wordnet' is already downloaded, if not, download it
|
30 |
-
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/wordnet')):
|
31 |
-
nltk.download('wordnet', download_dir=nltk_data_path)
|
|
|
|
|
|
|
|
|
32 |
|
33 |
-
|
34 |
-
lemmatizer = WordNetLemmatizer()
|
35 |
|
36 |
# 1. Remove punctuation
|
37 |
def remove_punctuation(text):
|
|
|
13 |
from nltk.tokenize import word_tokenize
|
14 |
from nltk.corpus import stopwords
|
15 |
from nltk.stem import WordNetLemmatizer # Import WordNetLemmatizer
|
16 |
+
import nltk
|
17 |
+
import os
|
18 |
|
19 |
# Set NLTK data path to the local 'nltk_data' folder in your project
|
20 |
nltk_data_path = 'nltk_data'
|
21 |
nltk.data.path.append(nltk_data_path)
|
22 |
|
|
|
|
|
|
|
|
|
23 |
# Check if 'punkt' is already downloaded, if not, download it
|
24 |
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
|
25 |
nltk.download('punkt', download_dir=nltk_data_path)
|
26 |
|
27 |
+
# Check if 'stopwords' is already downloaded, if not, download it
|
28 |
+
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
|
29 |
+
nltk.download('stopwords', download_dir=nltk_data_path)
|
30 |
+
|
31 |
+
# Now you can safely use word_tokenize and stopwords
|
32 |
+
from nltk.tokenize import word_tokenize
|
33 |
+
from nltk.corpus import stopwords
|
34 |
|
35 |
+
stop_words = set(stopwords.words('english'))
|
|
|
36 |
|
37 |
# 1. Remove punctuation
|
38 |
def remove_punctuation(text):
|