Fix punkt download and path issue
Browse files
app.py
CHANGED
@@ -13,25 +13,26 @@ import os
|
|
13 |
from nltk.tokenize import word_tokenize
|
14 |
from nltk.corpus import stopwords
|
15 |
from nltk.stem import WordNetLemmatizer # Import WordNetLemmatizer
|
|
|
|
|
16 |
|
17 |
# Set NLTK data path to the local 'nltk_data' folder in your project
|
18 |
nltk_data_path = 'nltk_data'
|
19 |
nltk.data.path.append(nltk_data_path)
|
20 |
|
21 |
-
# Check if 'stopwords' is already downloaded, if not, download it
|
22 |
-
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
|
23 |
-
nltk.download('stopwords', download_dir=nltk_data_path)
|
24 |
-
|
25 |
# Check if 'punkt' is already downloaded, if not, download it
|
26 |
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
|
27 |
nltk.download('punkt', download_dir=nltk_data_path)
|
28 |
|
29 |
-
# Check if 'wordnet' is already downloaded, if not, download it
|
30 |
-
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/wordnet')):
|
31 |
-
nltk.download('wordnet', download_dir=nltk_data_path)
|
|
|
|
|
|
|
|
|
32 |
|
33 |
-
|
34 |
-
lemmatizer = WordNetLemmatizer()
|
35 |
|
36 |
# 1. Remove punctuation
|
37 |
def remove_punctuation(text):
|
|
|
13 |
from nltk.tokenize import word_tokenize
|
14 |
from nltk.corpus import stopwords
|
15 |
from nltk.stem import WordNetLemmatizer # Import WordNetLemmatizer
|
16 |
+
import nltk
|
17 |
+
import os
|
18 |
|
19 |
# Set NLTK data path to the local 'nltk_data' folder in your project
|
20 |
nltk_data_path = 'nltk_data'
|
21 |
nltk.data.path.append(nltk_data_path)
|
22 |
|
|
|
|
|
|
|
|
|
23 |
# Check if 'punkt' is already downloaded, if not, download it
|
24 |
if not os.path.exists(os.path.join(nltk_data_path, 'tokenizers/punkt')):
|
25 |
nltk.download('punkt', download_dir=nltk_data_path)
|
26 |
|
27 |
+
# Check if 'stopwords' is already downloaded, if not, download it
|
28 |
+
if not os.path.exists(os.path.join(nltk_data_path, 'corpora/stopwords')):
|
29 |
+
nltk.download('stopwords', download_dir=nltk_data_path)
|
30 |
+
|
31 |
+
# Now you can safely use word_tokenize and stopwords
|
32 |
+
from nltk.tokenize import word_tokenize
|
33 |
+
from nltk.corpus import stopwords
|
34 |
|
35 |
+
stop_words = set(stopwords.words('english'))
|
|
|
36 |
|
37 |
# 1. Remove punctuation
|
38 |
def remove_punctuation(text):
|