kritsadaK committed on
Commit
07f46a5
·
1 Parent(s): db87ccc

Fix punkt download and path issue

Browse files
Files changed (1) hide show
  1. app.py +10 -9
app.py CHANGED
@@ -13,25 +13,26 @@ import os
13
  from nltk.tokenize import word_tokenize
14
  from nltk.corpus import stopwords
15
  from nltk.stem import WordNetLemmatizer # Import WordNetLemmatizer
 
 
16
 
17
  # Set NLTK data path to the local 'nltk_data' folder in your project
18
  nltk_data_path = 'nltk_data'
19
  nltk.data.path.append(nltk_data_path)
20
 
21
- # Check if 'stopwords' is already downloaded, if not, download it
22
- if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
23
- nltk.download('stopwords', download_dir=nltk_data_path)
24
-
25
  # Check if 'punkt' is already downloaded, if not, download it
26
  if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
27
  nltk.download('punkt', download_dir=nltk_data_path)
28
 
29
- # Check if 'wordnet' is already downloaded, if not, download it
30
- if not os.path.exists(os.path.join(nltk_data_path, 'corpora/wordnet')):
31
- nltk.download('wordnet', download_dir=nltk_data_path)
 
 
 
 
32
 
33
- # Initialize the lemmatizer
34
- lemmatizer = WordNetLemmatizer()
35
 
36
  # 1. Remove punctuation
37
  def remove_punctuation(text):
 
13
  from nltk.tokenize import word_tokenize
14
  from nltk.corpus import stopwords
15
  from nltk.stem import WordNetLemmatizer # Import WordNetLemmatizer
16
+ import nltk
17
+ import os
18
 
19
  # Set NLTK data path to the local 'nltk_data' folder in your project
20
  nltk_data_path = 'nltk_data'
21
  nltk.data.path.append(nltk_data_path)
22
 
 
 
 
 
23
  # Check if 'punkt' is already downloaded, if not, download it
24
  if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
25
  nltk.download('punkt', download_dir=nltk_data_path)
26
 
27
+ # Check if 'stopwords' is already downloaded, if not, download it
28
+ if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
29
+ nltk.download('stopwords', download_dir=nltk_data_path)
30
+
31
+ # Now you can safely use word_tokenize and stopwords
32
+ from nltk.tokenize import word_tokenize
33
+ from nltk.corpus import stopwords
34
 
35
+ stop_words = set(stopwords.words('english'))
 
36
 
37
  # 1. Remove punctuation
38
  def remove_punctuation(text):