Initial commit

Files changed (13) hide show

.gitignore +3 -0
.idea/.gitignore +3 -0
.idea/inspectionProfiles/Project_Default.xml +38 -0
.idea/inspectionProfiles/profiles_settings.xml +6 -0
.idea/keyword-extraction-viet.iml +10 -0
.idea/misc.xml +4 -0
.idea/modules.xml +8 -0
.idea/vcs.xml +6 -0
keyword_extraction.py +350 -0
model.py +58 -0
named_entities.py +45 -0
process_text.py +111 -0
vietnamese-stopwords-dash.txt +1990 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+venv
+pretrained-models
+test_file.txt

.idea/.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+# Default ignored files
+/shelf/
+/workspace.xml

.idea/inspectionProfiles/Project_Default.xml ADDED Viewed

	@@ -0,0 +1,38 @@

+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="PyChainedComparisonsInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoreConstantInTheMiddle" value="true" />
+    </inspection_tool>
+    <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
+      <option name="ignoredPackages">
+        <value>
+          <list size="13">
+            <item index="0" class="java.lang.String" itemvalue="scikit-image" />
+            <item index="1" class="java.lang.String" itemvalue="scipy" />
+            <item index="2" class="java.lang.String" itemvalue="scikit-learn" />
+            <item index="3" class="java.lang.String" itemvalue="PyYAML" />
+            <item index="4" class="java.lang.String" itemvalue="python-dateutil" />
+            <item index="5" class="java.lang.String" itemvalue="torch" />
+            <item index="6" class="java.lang.String" itemvalue="numpy" />
+            <item index="7" class="java.lang.String" itemvalue="torchvision" />
+            <item index="8" class="java.lang.String" itemvalue="pandas" />
+            <item index="9" class="java.lang.String" itemvalue="tqdm" />
+            <item index="10" class="java.lang.String" itemvalue="imageio" />
+            <item index="11" class="java.lang.String" itemvalue="matplotlib" />
+            <item index="12" class="java.lang.String" itemvalue="pytz" />
+          </list>
+        </value>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N803" />
+          <option value="N806" />
+          <option value="N802" />
+        </list>
+      </option>
+    </inspection_tool>
+  </profile>
+</component>

.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

.idea/keyword-extraction-viet.iml ADDED Viewed

	@@ -0,0 +1,10 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>

.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,4 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (keyword-extraction-viet)" project-jdk-type="Python SDK" />
+</project>

.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/keyword-extraction-viet.iml" filepath="$PROJECT_DIR$/.idea/keyword-extraction-viet.iml" />
+    </modules>
+  </component>
+</project>

.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

keyword_extraction.py ADDED Viewed

	@@ -0,0 +1,350 @@

+from string import punctuation
+import numpy as np
+import torch
+from sklearn.cluster import KMeans
+from named_entities import get_named_entities
+# Delimiters used by sub_sentence: every character of string.punctuation except
+# "_" (named entities are re-joined with "_" in get_segmentised_doc, so it must
+# not split a token).
+punctuation = [c for c in punctuation if c != "_"]
+# Additional typographic marks; "–" is listed twice and "," is already part of
+# string.punctuation, which is harmless for the lookups done in sub_sentence.
+punctuation += ["“", "–", ",", "…", "”", "–"]
+# with open('/Users/thaopham/bao-lao-dong-data/utils/vietnamese-stopwords-dash.txt') as f:
+#     stopwords = [w.strip() for w in f.readlines()]
+# Spellings of ethnic-group names containing an apostrophe, mapped to an
+# apostrophe-free form; applied to title and document in get_segmentised_doc
+# before word segmentation.
+ethnicity_dict_map = {"H'Mông": "HMông",
+                      "H'mông": "HMông",
+                      "H’mông": "HMông",
+                      "H’Mông": "HMông",
+                      "H’MÔNG": "HMông",
+                      "M'Nông": "MNông",
+                      "M'nông": "MNông",
+                      "M'NÔNG": "MNông",
+                      "M’Nông": "MNông",
+                      "M’NÔNG": "MNông",
+                      "K’Ho": "KHo",
+                      "K’Mẻo": "KMẻo"}
+def sub_sentence(sentence):
+    """Split ``sentence`` into fragments at every punctuation character.
+
+    Delimiters are the single characters of the module-level ``punctuation``
+    collection. Each fragment is whitespace-stripped and empty fragments are
+    dropped, including one left over after the last delimiter.
+
+    Args:
+        sentence: text to split.
+
+    Returns:
+        List of non-empty fragments in their original order.
+    """
+    fragments = []
+    current = []
+    # One pass over the characters instead of re-searching the remainder of the
+    # string for every delimiter after each split.
+    for char in sentence:
+        if char in punctuation:
+            fragments.append(''.join(current).strip())
+            current = []
+        else:
+            current.append(char)
+    # Flush whatever follows the last delimiter.
+    fragments.append(''.join(current).strip())
+    return [fragment for fragment in fragments if fragment]
+def check_for_stopwords(ngram, stopwords_ls):
+    """Report whether any whitespace-separated token of ``ngram`` is a stopword.
+
+    The comparison is exact and case-sensitive.
+
+    Args:
+        ngram: candidate phrase, tokens separated by whitespace.
+        stopwords_ls: iterable of stopword strings.
+
+    Returns:
+        True if at least one token equals a stopword, otherwise False.
+    """
+    return any(token == stopword
+               for token in ngram.split()
+               for stopword in stopwords_ls)
+def compute_ngram_list(segmentised_doc, ngram_n, stopwords_ls, subsentences=True):
+    """Collect the distinct candidate n-grams of a document.
+
+    An n-gram is kept when none of its tokens is a stopword and none of its
+    characters is numeric. Results are unique and ordered by first occurrence.
+
+    Args:
+        segmentised_doc: list of sentences whose tokens are whitespace-separated.
+        ngram_n: number of tokens per n-gram.
+        stopwords_ls: stopwords passed on to ``check_for_stopwords``.
+        subsentences: when True, sentences are first split at punctuation with
+            ``sub_sentence`` so that no n-gram spans a punctuation mark.
+
+    Returns:
+        List of n-gram strings (tokens joined by a single space).
+    """
+    if subsentences:
+        fragments = []
+        for sentence in segmentised_doc:
+            fragments.extend(sub_sentence(sentence))
+    else:
+        fragments = segmentised_doc
+    # A set gives O(1) duplicate detection; the list keeps first-seen order.
+    seen = set()
+    kept = []
+    for fragment in fragments:
+        tokens = fragment.split()
+        for start in range(len(tokens) - ngram_n + 1):
+            ngram = ' '.join(tokens[start:start + ngram_n])
+            if ngram in seen:
+                continue
+            seen.add(ngram)
+            if check_for_stopwords(ngram, stopwords_ls):
+                continue
+            if any(char.isnumeric() for char in ngram):
+                continue
+            kept.append(ngram)
+    return kept
+def cosine_similarity(a, b):
+    """Return the dot product of ``a`` and ``b`` divided by the product of their norms.
+
+    No guard is applied for zero-norm inputs.
+    """
+    dot_product = np.dot(a, b)
+    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
+    return dot_product / norm_product
+def get_doc_embeddings(segmentised_doc, tokenizer, phobert, stopwords):
+    """Embed a document as the mean of its per-sentence pooled model outputs.
+
+    Stopwords are removed from each sentence before encoding. The first
+    sentence's vector is doubled before averaging, so it weighs twice as much
+    as any other sentence.
+
+    Args:
+        segmentised_doc: list of sentences with whitespace-separated tokens.
+        tokenizer: object whose ``encode(str)`` returns a list of token ids.
+        phobert: model called on a ``(1, seq_len)`` id tensor; its result must
+            expose ``pooler_output`` with 768 values (the width allocated below).
+        stopwords: iterable of stopword strings.
+
+    Returns:
+        1-D tensor of length 768.
+    """
+    # Build the lookup once; testing each token against the list is linear in
+    # the number of stopwords.
+    stopword_set = set(stopwords)
+    doc_embedding = torch.zeros(size=(len(segmentised_doc), 768))
+    for i, sentence in enumerate(segmentised_doc):
+        kept_words = [word for word in sentence.split() if word not in stopword_set]
+        token_ids = tokenizer.encode(' '.join(kept_words))
+        input_ids = torch.tensor([token_ids])
+        with torch.no_grad():
+            features = phobert(input_ids)
+        sentence_vector = features.pooler_output.flatten()
+        # Row 0 is the first sentence and is given double weight.
+        doc_embedding[i, :] = 2 * sentence_vector if i == 0 else sentence_vector
+    return torch.mean(doc_embedding, axis=0)
+def get_segmentised_doc(nlp, rdrsegmenter, title, doc):
+    """Word-segment the title and document and glue named entities into single tokens.
+
+    Apostrophe spellings listed in ``ethnicity_dict_map`` are normalised first.
+    Named entities are detected in ``doc`` only (not in the title); every
+    occurrence of an entity in the segmented sentences has its spaces replaced
+    by underscores.
+
+    Args:
+        nlp: NER callable forwarded to ``get_named_entities``.
+        rdrsegmenter: object whose ``word_segment(str)`` returns a list of
+            segmented sentences.
+        title: optional title; when given, its sentences come first.
+        doc: document body.
+
+    Returns:
+        Tuple ``(ne_ls, segmentised_doc_ne)``: the set of named entities and the
+        list of segmented sentences after entity joining.
+    """
+    for original, replacement in ethnicity_dict_map.items():
+        if title is not None:
+            title = title.replace(original, replacement)
+        doc = doc.replace(original, replacement)
+    print(title)
+    # Segment the body exactly once and prepend the title's sentences if any.
+    segmentised_doc = rdrsegmenter.word_segment(doc)
+    if title is not None:
+        segmentised_doc = rdrsegmenter.word_segment(title) + segmentised_doc
+    print(segmentised_doc)
+    ne_ls = set(get_named_entities(nlp, doc))
+    print(sorted(ne_ls))
+    segmentised_doc_ne = []
+    for sent in segmentised_doc:
+        for ne in ne_ls:
+            sent = sent.replace(ne, '_'.join(ne.split()))
+        segmentised_doc_ne.append(sent)
+    return ne_ls, segmentised_doc_ne
+def compute_ngram_embeddings(tokenizer, phobert, ngram_list):
+    """Map every n-gram to the model's pooled output for that n-gram.
+
+    N-grams written entirely in upper case are lower-cased before encoding;
+    the dictionary key is always the n-gram as given.
+
+    Args:
+        tokenizer: object whose ``encode(str)`` returns a list of token ids.
+        phobert: model whose result exposes ``pooler_output``.
+        ngram_list: iterable of n-gram strings.
+
+    Returns:
+        Dict from n-gram to its ``pooler_output`` tensor.
+    """
+    embeddings_by_ngram = {}
+    for ngram in ngram_list:
+        model_text = ngram.lower() if ngram.isupper() else ngram
+        token_ids = tokenizer.encode(model_text)
+        batch = torch.tensor([token_ids])
+        with torch.no_grad():
+            outputs = phobert(batch)
+        embeddings_by_ngram[ngram] = outputs.pooler_output
+    return embeddings_by_ngram
+def normalised_cosine_similarity(ngram_embedding, document_embedding):
+    """Return the cosine similarity divided by the square root of the product of both norms."""
+    scale = np.sqrt(np.linalg.norm(ngram_embedding) * np.linalg.norm(document_embedding))
+    return cosine_similarity(ngram_embedding, document_embedding) / scale
+def compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding):
+    """Score each n-gram by the cosine similarity of its embedding to the document embedding.
+
+    Args:
+        ngram_list: iterable of n-gram strings.
+        ngram_embeddings: dict from n-gram to embedding.
+        doc_embedding: document embedding; its transpose is passed to
+            ``cosine_similarity``.
+
+    Returns:
+        Dict from n-gram to the first element of the flattened similarity.
+    """
+    return {
+        ngram: cosine_similarity(ngram_embeddings[ngram], doc_embedding.T).flatten()[0]
+        for ngram in ngram_list
+    }
+# def compute_ngram_similarity(ngram_list, ngram_embeddings, doc_embedding):
+#     ngram_similarity_dict = {}
+#
+#     for ngram in ngram_list:
+#         score = [cosine_similarity(ngram_embeddings[ngram], doc_embedding.T).flatten()[0]]
+#         for w in ngram.split():
+#             similarity_score = cosine_similarity(ngram_embeddings[w], doc_embedding.T).flatten()[0]
+#             score.append(similarity_score)
+#         ngram_similarity_dict[ngram] = np.mean(score)
+#
+#     return ngram_similarity_dict
+def diversify_result(ngram_result, ngram_embeddings, top_n=5):
+    """Pick ``top_n`` diverse keywords by repeated K-means voting.
+
+    The ``4 * top_n`` best-scoring n-grams are clustered into ``top_n``
+    clusters 100 times with different seeds. In every run the best-scoring
+    member of each cluster receives one vote; the most-voted entries win.
+
+    Args:
+        ngram_result: dict from n-gram to relevance score.
+        ngram_embeddings: dict from n-gram to embedding.
+        top_n: number of keywords to return.
+
+    Returns:
+        List of at most ``top_n`` ``(ngram, score)`` tuples.
+    """
+    best_ngrams = sorted(ngram_result, key=ngram_result.get, reverse=True)[:top_n * 4]
+    # K-means cannot build more clusters than there are samples; with this few
+    # candidates every one of them is selected anyway, in score order.
+    if len(best_ngrams) <= top_n:
+        return [(ngram, ngram_result[ngram]) for ngram in best_ngrams]
+    # reshape (rather than squeeze) keeps the matrix 2-D for any candidate count.
+    best_ngram_embeddings = np.array(
+        [ngram_embeddings[ngram] for ngram in best_ngrams]).reshape(len(best_ngrams), -1)
+    vote = {}
+    for niter in range(100):
+        kmeans = KMeans(n_clusters=top_n, init='k-means++', random_state=niter * 2, n_init="auto").fit(
+            best_ngram_embeddings)
+        # best_ngrams is sorted by score, so the first member seen for a label
+        # is that cluster's best-scoring n-gram.
+        representatives = {}
+        for ngram, label in zip(best_ngrams, kmeans.labels_):
+            representatives.setdefault(label, (ngram, ngram_result[ngram]))
+        for keyword in representatives.values():
+            vote[keyword] = vote.get(keyword, 0) + 1
+    return sorted(vote, key=vote.get, reverse=True)[:top_n]
+def mmr(ngram_result, ngram_embeddings, lambda_=0.7, top_n=5):
+    """Re-rank the best candidates by relevance minus redundancy.
+
+    Only the ``4 * top_n`` highest-scoring n-grams are considered. Each one is
+    scored as ``lambda_ * relevance - (1 - lambda_) * max_sim`` where
+    ``max_sim`` is its highest similarity to any other candidate (starting
+    from -1); candidates that differ only by letter case count as similarity 1.
+
+    Args:
+        ngram_result: dict from n-gram to relevance score.
+        ngram_embeddings: dict from n-gram to embedding.
+        lambda_: weight of relevance versus redundancy.
+        top_n: number of entries to return.
+
+    Returns:
+        List of at most ``top_n`` ``(ngram, score)`` tuples, best first.
+    """
+    shortlist = sorted(ngram_result, key=ngram_result.get, reverse=True)[:top_n * 4]
+    scored = []
+    for candidate in shortlist:
+        max_sim = -1
+        for other in shortlist:
+            if other == candidate:
+                continue
+            pair_similarity = cosine_similarity(
+                ngram_embeddings[candidate], ngram_embeddings[other].T).flatten()[0]
+            if other.lower() == candidate.lower():
+                pair_similarity = 1
+            if pair_similarity > max_sim:
+                max_sim = pair_similarity
+        relevance = ngram_result[candidate]
+        scored.append((candidate, lambda_ * relevance - (1 - lambda_) * max_sim))
+    scored.sort(key=lambda item: item[1], reverse=True)
+    return scored[:top_n]
+def remove_duplicates(ngram_result):
+    """Merge n-grams that differ only by letter case, in place.
+
+    The first spelling encountered is kept. Each later variant is folded into
+    it by replacing the kept score with the mean of the kept score and the
+    variant's score, after which the variant is removed.
+
+    Args:
+        ngram_result: dict from n-gram to score; it is modified.
+
+    Returns:
+        The same dict object, without the merged variants.
+    """
+    # One pass keyed on the lower-cased text replaces the all-pairs comparison.
+    first_spelling = {}
+    for ngram in list(ngram_result):
+        key = ngram.lower()
+        if key not in first_spelling:
+            first_spelling[key] = ngram
+            continue
+        kept = first_spelling[key]
+        ngram_result[kept] = np.mean([ngram_result[kept], ngram_result[ngram]])
+        del ngram_result[ngram]
+    return ngram_result
+def compute_filtered_text(annotator, title, text):
+    """Return the document's sentences reduced to words tagged N, Np or V.
+
+    When a title is given it is prepended to the text as ``title + '. ' + text``
+    before annotation.
+
+    Args:
+        annotator: object whose ``annotate_text(str)`` returns a dict whose
+            values are lists of token dicts with ``wordForm`` and ``posTag``.
+        title: optional title.
+        text: document body.
+
+    Returns:
+        List with one space-joined string of kept word forms per annotated
+        sentence.
+    """
+    # Annotate once: annotating the bare text first is wasted work whenever a
+    # title is present, because that result is discarded.
+    full_text = text if title is None else title + '. ' + text
+    annotated = annotator.annotate_text(full_text)
+    # keep_tags = ['N', 'Np', 'V', 'A']
+    keep_tags = ['N', 'Np', 'V']
+    filtered_sentences = []
+    for tokens in annotated.values():
+        kept_words = [token['wordForm'] for token in tokens if token['posTag'] in keep_tags]
+        filtered_sentences.append(' '.join(kept_words))
+    return filtered_sentences
+def get_candidate_ngrams(segmentised_doc, filtered_segmentised_doc, ngram_n, stopwords_ls):
+    """Return n-grams of the POS-filtered text that also occur in the original text.
+
+    N-grams built from the filtered sentences can join words that were not
+    adjacent in the source; intersecting with the n-grams of the punctuation-
+    split original sentences removes those.
+
+    Args:
+        segmentised_doc: word-segmented sentences of the document.
+        filtered_segmentised_doc: the same document after POS filtering.
+        ngram_n: number of tokens per n-gram.
+        stopwords_ls: stopwords passed to ``compute_ngram_list``.
+
+    Returns:
+        List of candidate n-grams in the order they appear in the filtered text.
+    """
+    # A set makes each membership test O(1) instead of a scan of the list.
+    actual_ngrams = set(compute_ngram_list(segmentised_doc, ngram_n, stopwords_ls, subsentences=True))
+    filtered_ngram_list = compute_ngram_list(filtered_segmentised_doc, ngram_n, stopwords_ls,
+                                             subsentences=False)
+    return [ngram for ngram in filtered_ngram_list if ngram in actual_ngrams]
+def get_ngram_frequencies(doc_segmentised, ngram_list):
+    """Count how often each n-gram occurs and keep those seen more than once.
+
+    An occurrence is a window of consecutive whitespace-separated tokens inside
+    one sentence that equals the n-gram when joined by single spaces.
+
+    Args:
+        doc_segmentised: iterable of sentences with whitespace-separated tokens.
+        ngram_list: iterable of n-gram strings.
+
+    Returns:
+        Dict from n-gram to its count, restricted to counts greater than 1, in
+        the iteration order of ``ngram_list``.
+    """
+    from collections import Counter
+
+    tokenised = [sentence.split() for sentence in doc_segmentised]
+    # Windows are counted once per n-gram length instead of rescanning the
+    # whole document for every candidate.
+    counts_by_length = {}
+    ngram_dict_freq = {}
+    for ngram in ngram_list:
+        ngram_n = len(ngram.split())
+        if ngram_n not in counts_by_length:
+            counts_by_length[ngram_n] = Counter(
+                ' '.join(tokens[i:i + ngram_n])
+                for tokens in tokenised
+                for i in range(len(tokens) - ngram_n + 1))
+        count = counts_by_length[ngram_n][ngram]
+        if count > 1:
+            ngram_dict_freq[ngram] = count
+    return ngram_dict_freq
+def extract_keywords(text, title, nlp, annotator, tokenizer, phobert, stopwords_ls, ngram_n=(2, 2), top_n=5,
+                     use_kmeans=False, use_mmr=False):
+    """Run the whole keyword-extraction pipeline on one document.
+
+    Candidates are the n-grams (for every length in the inclusive range
+    ``ngram_n``) shared by the segmented and the POS-filtered text, plus the
+    detected named entities. Only candidates occurring more than once are
+    embedded and scored against the document embedding; case variants are
+    merged before ranking.
+
+    Args:
+        text: document body.
+        title: optional title.
+        nlp: NER callable.
+        annotator: object used for word segmentation and POS annotation.
+        tokenizer: tokenizer paired with ``phobert``.
+        phobert: embedding model.
+        stopwords_ls: stopword strings.
+        ngram_n: ``(low, high)`` n-gram lengths, both inclusive.
+        top_n: number of keywords to return.
+        use_kmeans: diversify with ``diversify_result``; checked before ``use_mmr``.
+        use_mmr: diversify with ``mmr`` (``lambda_=0.85``).
+
+    Returns:
+        The selected function's result; without diversification, the ``top_n``
+        best ``(ngram, score)`` tuples.
+    """
+    shortest, longest = ngram_n
+    named_entities, segmented_sentences = get_segmentised_doc(nlp, annotator, title, text)
+    pos_filtered_sentences = compute_filtered_text(annotator, title, text)
+    print("Generate ngram list")
+    candidates = set()
+    for size in range(shortest, longest + 1):
+        candidates.update(get_candidate_ngrams(segmented_sentences, pos_filtered_sentences, size, stopwords_ls))
+    # Named entities are always candidates, in their word-segmented form.
+    segmented_entities = [annotator.word_segment(entity)[0] for entity in named_entities]
+    candidates.update(segmented_entities)
+    print(len(candidates))
+    frequent_ngrams = get_ngram_frequencies(segmented_sentences, candidates)
+    print(len(frequent_ngrams))
+    print("Generate ngram embeddings")
+    ngram_embeddings = compute_ngram_embeddings(tokenizer, phobert, frequent_ngrams)
+    print("Generate document embeddings")
+    doc_embedding = get_doc_embeddings(pos_filtered_sentences, tokenizer, phobert, stopwords_ls)
+    similarity_by_ngram = compute_ngram_similarity(frequent_ngrams, ngram_embeddings, doc_embedding)
+    ngram_result = remove_duplicates(similarity_by_ngram)
+    ranked = sorted(ngram_result.items(), key=lambda item: item[1], reverse=True)
+    non_diversified = ranked[:top_n]
+    if use_kmeans:
+        return diversify_result(ngram_result, ngram_embeddings, top_n=top_n)
+    if use_mmr:
+        return mmr(ngram_result, ngram_embeddings, lambda_=0.85, top_n=top_n)
+    return non_diversified
+def highlight(text, keywords):
+    """Wrap each keyword occurrence that follows a space in ``<mark>`` tags.
+
+    Keywords are processed in the given order; an occurrence at the very start
+    of ``text`` is not preceded by a space and is therefore left unmarked.
+    """
+    marked = text
+    for keyword in keywords:
+        needle = f" {keyword}"
+        marked = marked.replace(needle, f" <mark>{keyword}</mark>")
+    return marked

model.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import py_vncorenlp
+from transformers import AutoTokenizer, pipeline
+import torch
+import os
+from keyword_extraction import extract_keywords
+import sys
+class KeyBERTVi:
+    """Keyword extractor bundling a VnCoreNLP annotator, PhoBERT and an NER pipeline."""
+
+    def __init__(self, stopwords_file_path):
+        """Load all models and the stopword list.
+
+        Model files are read from ``pretrained-models`` under the module-level
+        ``dir_path``.
+
+        Args:
+            stopwords_file_path: path of a UTF-8 text file with one stopword
+                per line.
+        """
+        self.annotator = py_vncorenlp.VnCoreNLP(annotators=["wseg", "pos"],
+                                                save_dir=f'{dir_path}/pretrained-models/vncorenlp')
+        # model = py_vncorenlp.VnCoreNLP(save_dir='/absolute/path/to/vncorenlp')
+        print("Loading PhoBERT model")
+        self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
+        # use absolute path because torch is cached
+        # NOTE(review): torch.load unpickles the file, which can execute code;
+        # only load checkpoints that come from a trusted source.
+        self.phobert = torch.load(f'{dir_path}/pretrained-models/phobert.pt')
+        self.phobert.eval()
+        print("Loading NER model")
+        ner_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
+        # NOTE(review): same pickle caveat as for the PhoBERT checkpoint above.
+        ner_model = torch.load(f'{dir_path}/pretrained-models/ner-vietnamese-electra-base.pt')
+        ner_model.eval()
+        self.ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)
+        # The stopword list is Vietnamese text: read it as UTF-8 explicitly
+        # instead of relying on the platform's default encoding.
+        with open(stopwords_file_path, encoding="utf-8") as f:
+            self.stopwords = [line.strip() for line in f]
+
+    def extract_keywords(self, title, text, ngram_range=(1, 3), top_n=5):
+        """Extract keywords from one document with the loaded models.
+
+        Args:
+            title: optional title (may be None).
+            text: document body.
+            ngram_range: ``(low, high)`` n-gram lengths, both inclusive.
+            top_n: number of keywords to return.
+
+        Returns:
+            The result of the module-level ``extract_keywords`` called without
+            diversification.
+        """
+        keyword_ls = extract_keywords(text, title,
+                                      self.ner_pipeline,
+                                      self.annotator,
+                                      self.phobert_tokenizer,
+                                      self.phobert,
+                                      self.stopwords,
+                                      ngram_n=ngram_range,
+                                      top_n=top_n)
+        return keyword_ls
+# Directory containing this file; model and data paths are resolved against it.
+dir_path = os.path.dirname(os.path.realpath(__file__))
+if __name__ == "__main__":
+    # Command-line entry point. The single argument is the path of the text
+    # file to process, relative to this file's directory.
+    if len(sys.argv) < 2:
+        sys.exit("usage: python model.py <text_file_path>")
+    print(dir_path)
+    stopwords_file_path = f'{dir_path}/vietnamese-stopwords-dash.txt'
+    text_file_path = sys.argv[1]
+    # Read as UTF-8 explicitly; the platform default may not decode Vietnamese text.
+    with open(f'{dir_path}/{text_file_path}', 'r', encoding='utf-8') as f:
+        text = ' '.join([ln.strip() for ln in f.readlines()])
+        print(text)
+    kw_model = KeyBERTVi(stopwords_file_path)
+    title = None
+    keyword_ls = kw_model.extract_keywords(title, text, ngram_range=(1, 3), top_n=5)
+    print(keyword_ls)

named_entities.py ADDED Viewed

	@@ -0,0 +1,45 @@

+# from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
+from underthesea import sent_tokenize
+import torch
+def substring(w, ls):
+    """Report whether ``w`` is contained in some element of ``ls`` other than itself."""
+    return any(w != candidate and w in candidate for candidate in ls)
+def get_ner_phrases(sent_ner_result):
+    """Group tagged tokens with consecutive indices into phrases.
+
+    Args:
+        sent_ner_result: list of dicts with the keys ``word``, ``index`` and
+            ``entity``, in token order.
+
+    Returns:
+        List of ``(phrase, entity)`` tuples, where ``phrase`` is the words of
+        one run joined by spaces and ``entity`` is the label of the run's last
+        token. An empty input gives an empty list.
+    """
+    if not sent_ner_result:
+        return []
+    ner_list = []
+    previous = sent_ner_result[0]
+    current_words = [previous["word"]]
+    for token in sent_ner_result[1:]:
+        if token["index"] == previous["index"] + 1:
+            current_words.append(token["word"])
+        else:
+            # Gap in the indices: close the current run and start a new one.
+            ner_list.append((' '.join(current_words), previous['entity']))
+            current_words = [token["word"]]
+        previous = token
+    ner_list.append((' '.join(current_words), previous['entity']))
+    return ner_list
+def get_named_entities(nlp, doc):
+    """Return the distinct named-entity phrases found in ``doc``.
+
+    The document is split into sentences, each sentence is tagged with ``nlp``
+    and the tokens are grouped with ``get_ner_phrases``. Only phrases whose
+    label starts with ``'I'`` are kept; phrases contained in another kept
+    phrase are dropped, and ``" ##"`` markers are removed from the survivors.
+
+    Args:
+        nlp: callable returning the tagged tokens of a sentence.
+        doc: document text.
+
+    Returns:
+        List of entity strings in order of first appearance.
+    """
+    phrases = []
+    for sentence in sent_tokenize(doc):
+        tagged_tokens = nlp(sentence)
+        if len(tagged_tokens) > 0:
+            phrases.extend(get_ner_phrases(tagged_tokens))
+    # dict.fromkeys removes repeats while keeping first-seen order.
+    unique_entities = list(dict.fromkeys(
+        phrase for phrase, label in phrases if label.startswith('I')))
+    return [entity.replace(" ##", "")
+            for entity in unique_entities
+            if not substring(entity, unique_entities)]

process_text.py ADDED Viewed

	@@ -0,0 +1,111 @@

+from string import punctuation
+import re
+def process_article_content(text):
+    """Clean raw article text.
+
+    Steps, in order: strip the text and normalise characters with
+    ``replace_all``; collapse repeated newlines; separate sentences that run
+    into each other with ``process_sticking_sentences``; delete photo credits
+    introduced by "Ảnh: "; collapse repeated spaces.
+
+    Args:
+        text: raw article text.
+
+    Returns:
+        The cleaned text.
+    """
+    full_text_processed = replace_all(text.strip())
+    while '\n\n' in full_text_processed:
+        full_text_processed = full_text_processed.replace('\n\n', '\n')
+    full_text_processed = process_sticking_sentences(full_text_processed)
+    # Raw strings keep "\." a regex escape instead of an invalid string escape.
+    # Credit made of two words, with an optional trailing full stop.
+    pattern = r"Ảnh: [A-ZĐÀÁÂÃÈÉÊẾÌÍÒÓÔÕÙÚĂĐĨŨƠƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỂỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪỬỮỰỲỴÝỶỸa-zđ][a-zàáâãèéêếìíòóôõùúăđĩũơưăạảấầẩẫậắằẳẵặẹẻẽềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ]+ [A-ZĐÀÁÂÃÈÉÊẾÌÍÒÓÔÕÙÚĂĐĨŨƠƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỂỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪỬỮỰỲỴÝỶỸa-zđ][a-zàáâãèéêếìíòóôõùúăđĩũơưăạảấầẩẫậắằẳẵặẹẻẽềểễệỉịọỏốồổỗộớờởỡợụủứừửữựỳỵỷỹ]+\.?"
+    full_text_processed = re.sub(pattern, '', full_text_processed)
+    # Credit made of a single run of upper-case letters, with an optional full stop.
+    pattern = r"Ảnh: [A-ZĐÀÁÂÃÈÉÊẾÌÍÒÓÔÕÙÚĂĐĨŨƠƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỂỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪỬỮỰỲỴÝỶỸ]+\.?"
+    full_text_processed = re.sub(pattern, '', full_text_processed)
+    while '  ' in full_text_processed:
+        full_text_processed = full_text_processed.replace('  ', ' ')
+    return full_text_processed
+def replace_all(text):
+    """Apply a fixed table of literal substring replacements to ``text``.
+
+    The replacements run one after another in the table's order, each on the
+    result of the previous one.
+
+    Args:
+        text: input string.
+
+    Returns:
+        The string after all replacements.
+    """
+    dict_map = {
+        # Tone mark moved from the "o" onto the following "a".
+        "òa": "oà",
+        "Òa": "Oà",
+        "ÒA": "OÀ",
+        "óa": "oá",
+        "Óa": "Oá",
+        "ÓA": "OÁ",
+        "ỏa": "oả",
+        "Ỏa": "Oả",
+        "ỎA": "OẢ",
+        "õa": "oã",
+        "Õa": "Oã",
+        "ÕA": "OÃ",
+        "ọa": "oạ",
+        "Ọa": "Oạ",
+        "ỌA": "OẠ",
+        # Tone mark moved from the "o" onto the following "e".
+        "òe": "oè",
+        "Òe": "Oè",
+        "ÒE": "OÈ",
+        "óe": "oé",
+        "Óe": "Oé",
+        "ÓE": "OÉ",
+        "ỏe": "oẻ",
+        "Ỏe": "Oẻ",
+        "ỎE": "OẺ",
+        "õe": "oẽ",
+        "Õe": "Oẽ",
+        "ÕE": "OẼ",
+        "ọe": "oẹ",
+        "Ọe": "Oẹ",
+        "ỌE": "OẸ",
+        # Tone mark moved from the "u" onto the following "y".
+        "ùy": "uỳ",
+        "Ùy": "Uỳ",
+        "ÙY": "UỲ",
+        "úy": "uý",
+        "Úy": "Uý",
+        "ÚY": "UÝ",
+        "ủy": "uỷ",
+        "Ủy": "Uỷ",
+        "ỦY": "UỶ",
+        "ũy": "uỹ",
+        "Ũy": "Uỹ",
+        "ŨY": "UỸ",
+        "ụy": "uỵ",
+        "Ụy": "Uỵ",
+        "ỤY": "UỴ",
+        # Non-breaking space, ellipsis character, doubled apostrophes and
+        # numeric HTML entities for the quote characters.
+        "\xa0": " ",
+        "…": "...",
+        "''": '"',
+        "&#34;": '"',
+        "&#39;": "'",
+        # Ethnic-group names: apostrophe removed.
+        # NOTE(review): the target spelling here ("Hmông") differs from the one
+        # in keyword_extraction.ethnicity_dict_map ("HMông") - confirm intended.
+        "H'Mông": "Hmông",
+        "H'mông": "Hmông",
+        "H’mông": "Hmông",
+        "H’Mông": "Hmông",
+        "H’MÔNG": "Hmông",
+        "M'Nông": "Mnông",
+        "M'nông": "Mnông",
+        "M'NÔNG": "Mnông",
+        "M’Nông": "Mnông",
+        "M’NÔNG": "Mnông",
+        # Two consecutive zero-width spaces are deleted.
+        '\u200b\u200b': ""
+    }
+    # Dict order is insertion order, so the replacements run top to bottom.
+    for i, j in dict_map.items():
+        text = text.replace(i, j)
+    return text
+def process_sticking_sentences(full_text):
+    """Insert the separator missing between sentences that run into each other.
+
+    Two adjacent characters are treated as a sentence boundary when the second
+    is an upper-case letter and the first is either
+
+    * a punctuation character ("end.Start"): a space is inserted, or
+    * a lower-case letter ("endStart"): ". " is inserted.
+
+    Args:
+        full_text: text to repair.
+
+    Returns:
+        The text with separators inserted.
+    """
+    if len(full_text) < 2:
+        return full_text
+    # Build the result in one pass over the original character pairs. This
+    # also covers the end of the text, which an index loop bounded by the
+    # original length misses once insertions have made the string longer.
+    pieces = []
+    for c1, c2 in zip(full_text, full_text[1:]):
+        pieces.append(c1)
+        if not (c2.isalpha() and c2.isupper()):
+            continue
+        if c1 in punctuation:
+            pieces.append(" ")
+        elif c1.isalpha() and c1.islower():
+            pieces.append(". ")
+    pieces.append(full_text[-1])
+    return "".join(pieces)

vietnamese-stopwords-dash.txt ADDED Viewed

	@@ -0,0 +1,1990 @@

+a_lô
+a_ha
+ai
+ai_ai
+ai_nấy
+ai_đó
+alô
+amen
+anh
+anh_ấy
+ba
+ba_ba
+ba_bản
+ba_cùng
+ba_họ
+ba_ngày
+ba_ngôi
+ba_tăng
+bao_giờ
+bao_lâu
+bao_nhiêu
+bao_nả
+bay_biến
+biết
+biết_bao
+biết_bao_nhiêu
+biết_chắc
+biết_chừng_nào
+biết_mình
+biết_mấy
+biết_thế
+biết_trước
+biết_việc
+biết_đâu
+biết_đâu_chừng
+biết_đâu_đấy
+biết_được
+buổi
+buổi_làm
+buổi_mới
+buổi_ngày
+buổi_sớm
+bà
+bà_ấy
+bài
+bài_bác
+bài_bỏ
+bài_cái
+bác
+bán
+bán_cấp
+bán_dạ
+bán_thế
+bây_bẩy
+bây_chừ
+bây_giờ
+bây_nhiêu
+bèn
+béng
+bên
+bên_bị
+bên_có
+bên_cạnh
+bông
+bước
+bước_khỏi
+bước_tới
+bước_đi
+bạn
+bản
+bản_bộ
+bản_riêng
+bản_thân
+bản_ý
+bất_chợt
+bất_cứ
+bất_giác
+bất_kì
+bất_kể
+bất_kỳ
+bất_luận
+bất_ngờ
+bất_nhược
+bất_quá
+bất_quá_chỉ
+bất_thình_lình
+bất_tử
+bất_đồ
+bấy
+bấy_chầy
+bấy_chừ
+bấy_giờ
+bấy_lâu
+bấy_lâu_nay
+bấy_nay
+bấy_nhiêu
+bập_bà_bập_bõm
+bập_bõm
+bắt_đầu
+bắt_đầu_từ
+bằng
+bằng_cứ
+bằng_không
+bằng_người
+bằng_nhau
+bằng_như
+bằng_nào
+bằng_nấy
+bằng_vào
+bằng_được
+bằng_ấy
+bển
+bệt
+bị
+bị_chú
+bị_vì
+bỏ
+bỏ_bà
+bỏ_cha
+bỏ_cuộc
+bỏ_không
+bỏ_lại
+bỏ_mình
+bỏ_mất
+bỏ_mẹ
+bỏ_nhỏ
+bỏ_quá
+bỏ_ra
+bỏ_riêng
+bỏ_việc
+bỏ_xa
+bỗng
+bỗng_chốc
+bỗng_dưng
+bỗng_không
+bỗng_nhiên
+bỗng_nhưng
+bỗng_thấy
+bỗng_đâu
+bộ
+bộ_thuộc
+bộ_điều
+bội_phần
+bớ
+bởi
+bởi_ai
+bởi_chưng
+bởi_nhưng
+bởi_sao
+bởi_thế
+bởi_thế_cho_nên
+bởi_tại
+bởi_vì
+bởi_vậy
+bởi_đâu
+bức
+cao
+cao_lâu
+cao_ráo
+cao_răng
+cao_sang
+cao_số
+cao_thấp
+cao_thế
+cao_xa
+cha
+cha_chả
+chao_ôi
+chia_sẻ
+chiếc
+cho
+cho_biết
+cho_chắc
+cho_hay
+cho_nhau
+cho_nên
+cho_rằng
+cho_rồi
+cho_thấy
+cho_tin
+cho_tới
+cho_tới_khi
+cho_về
+cho_ăn
+cho_đang
+cho_được
+cho_đến
+cho_đến_khi
+cho_đến_nỗi
+choa
+chu_cha
+chui_cha
+chung
+chung_cho
+chung_chung
+chung_cuộc
+chung_cục
+chung_nhau
+chung_qui
+chung_quy
+chung_quy_lại
+chung_ái
+chuyển
+chuyển_tự
+chuyển_đạt
+chuyện
+chuẩn_bị
+chành_chạnh
+chí_chết
+chính
+chính_bản
+chính_giữa
+chính_là
+chính_thị
+chính_điểm
+chùn_chùn
+chùn_chũn
+chú
+chú_dẫn
+chú_khách
+chú_mày
+chú_mình
+chúng
+chúng_mình
+chúng_ta
+chúng_tôi
+chúng_ông
+chăn_chắn
+chăng
+chăng_chắc
+chăng_nữa
+chơi
+chơi_họ
+chưa
+chưa_bao_giờ
+chưa_chắc
+chưa_có
+chưa_cần
+chưa_dùng
+chưa_dễ
+chưa_kể
+chưa_tính
+chưa_từng
+chầm_chập
+chậc
+chắc
+chắc_chắn
+chắc_dạ
+chắc_hẳn
+chắc_lòng
+chắc_người
+chắc_vào
+chắc_ăn
+chẳng_lẽ
+chẳng_những
+chẳng_nữa
+chẳng_phải
+chết_nỗi
+chết_thật
+chết_tiệt
+chỉ
+chỉ_chính
+chỉ_có
+chỉ_là
+chỉ_tên
+chỉn
+chị
+chị_bộ
+chị_ấy
+chịu
+chịu_chưa
+chịu_lời
+chịu_tốt
+chịu_ăn
+chọn
+chọn_bên
+chọn_ra
+chốc_chốc
+chớ
+chớ_chi
+chớ_gì
+chớ_không
+chớ_kể
+chớ_như
+chợt
+chợt_nghe
+chợt_nhìn
+chủn
+chứ
+chứ_ai
+chứ_còn
+chứ_gì
+chứ_không
+chứ_không_phải
+chứ_lại
+chứ_lị
+chứ_như
+chứ_sao
+coi_bộ
+coi_mòi
+con
+con_con
+con_dạ
+con_nhà
+con_tính
+cu_cậu
+cuối
+cuối_cùng
+cuối_điểm
+cuốn
+cuộc
+càng
+càng_càng
+càng_hay
+cá_nhân
+các
+các_cậu
+cách
+cách_bức
+cách_không
+cách_nhau
+cách_đều
+cái
+cái_gì
+cái_họ
+cái_đã
+cái_đó
+cái_ấy
+cây_nước
+còn
+còn_như
+còn_nữa
+còn_thời_gian
+còn_về
+có
+có_ai
+có_chuyện
+có_chăng
+có_chăng_là
+có_chứ
+có_cơ
+có_dễ
+có_họ
+có_khi
+có_ngày
+có_người
+có_nhiều
+có_nhà
+có_phải
+có_số
+có_tháng
+có_thế
+có_thể
+có_vẻ
+có_ý
+có_ăn
+có_điều
+có_điều_kiện
+có_đáng
+có_đâu
+có_được
+cóc_khô
+cô
+cô_mình
+cô_quả
+cô_tăng
+cô_ấy
+công_nhiên
+cùng
+cùng_chung
+cùng_cực
+cùng_nhau
+cùng_tuổi
+cùng_tột
+cùng_với
+cùng_ăn
+căn
+căn_cái
+căn_cắt
+căn_tính
+cũng
+cũng_như
+cũng_nên
+cũng_thế
+cũng_vậy
+cũng_vậy_thôi
+cũng_được
+cơ
+cơ_chỉ
+cơ_chừng
+cơ_cùng
+cơ_dẫn
+cơ_hồ
+cơ_hội
+cơ_mà
+cơn
+cả
+cả_nghe
+cả_nghĩ
+cả_ngày
+cả_người
+cả_nhà
+cả_năm
+cả_thảy
+cả_thể
+cả_tin
+cả_ăn
+cả_đến
+cảm_thấy
+cảm_ơn
+cấp
+cấp_số
+cấp_trực_tiếp
+cần
+cần_cấp
+cần_gì
+cần_số
+cật_lực
+cật_sức
+cậu
+cổ_lai
+cụ_thể
+cụ_thể_là
+cụ_thể_như
+của
+của_ngọt
+của_tin
+cứ
+cứ_như
+cứ_việc
+cứ_điểm
+cực_lực
+do
+do_vì
+do_vậy
+do_đó
+duy
+duy_chỉ
+duy_có
+dài
+dài_lời
+dài_ra
+dành
+dành_dành
+dào
+dì
+dù
+dù_cho
+dù_dì
+dù_gì
+dù_rằng
+dù_sao
+dùng
+dùng_cho
+dùng_hết
+dùng_làm
+dùng_đến
+dưới
+dưới_nước
+dạ
+dạ_bán
+dạ_con
+dạ_dài
+dạ_dạ
+dạ_khách
+dần_dà
+dần_dần
+dầu_sao
+dẫn
+dẫu
+dẫu_mà
+dẫu_rằng
+dẫu_sao
+dễ
+dễ_dùng
+dễ_gì
+dễ_khiến
+dễ_nghe
+dễ_ngươi
+dễ_như_chơi
+dễ_sợ
+dễ_sử_dụng
+dễ_thường
+dễ_thấy
+dễ_ăn
+dễ_đâu
+dở_chừng
+dữ
+dữ_cách
+em
+em_em
+giá_trị
+giá_trị_thực_tế
+giảm
+giảm_chính
+giảm_thấp
+giảm_thế
+giống
+giống_người
+giống_nhau
+giống_như
+giờ
+giờ_lâu
+giờ_này
+giờ_đi
+giờ_đây
+giờ_đến
+giữ
+giữ_lấy
+giữ_ý
+giữa
+giữa_lúc
+gây
+gây_cho
+gây_giống
+gây_ra
+gây_thêm
+gì
+gì_gì
+gì_đó
+gần
+gần_bên
+gần_hết
+gần_ngày
+gần_như
+gần_xa
+gần_đây
+gần_đến
+gặp
+gặp_khó_khăn
+gặp_phải
+gồm
+hay
+hay_biết
+hay_hay
+hay_không
+hay_là
+hay_làm
+hay_nhỉ
+hay_nói
+hay_sao
+hay_tin
+hay_đâu
+hiểu
+hiện_nay
+hiện_tại
+hoàn_toàn
+hoặc
+hoặc_là
+hãy
+hãy_còn
+hơn
+hơn_cả
+hơn_hết
+hơn_là
+hơn_nữa
+hơn_trước
+hầu_hết
+hết
+hết_chuyện
+hết_cả
+hết_của
+hết_nói
+hết_ráo
+hết_rồi
+hết_ý
+họ
+họ_gần
+họ_xa
+hỏi
+hỏi_lại
+hỏi_xem
+hỏi_xin
+hỗ_trợ
+khi
+khi_khác
+khi_không
+khi_nào
+khi_nên
+khi_trước
+khiến
+khoảng
+khoảng_cách
+khoảng_không
+khá
+khá_tốt
+khác
+khác_gì
+khác_khác
+khác_nhau
+khác_nào
+khác_thường
+khác_xa
+khách
+khó
+khó_biết
+khó_chơi
+khó_khăn
+khó_làm
+khó_mở
+khó_nghe
+khó_nghĩ
+khó_nói
+khó_thấy
+khó_tránh
+không
+không_ai
+không_bao_giờ
+không_bao_lâu
+không_biết
+không_bán
+không_chỉ
+không_còn
+không_có
+không_có_gì
+không_cùng
+không_cần
+không_cứ
+không_dùng
+không_gì
+không_hay
+không_khỏi
+không_kể
+không_ngoài
+không_nhận
+không_những
+không_phải
+không_phải_không
+không_thể
+không_tính
+không_điều_kiện
+không_được
+không_đầy
+không_để
+khẳng_định
+khỏi
+khỏi_nói
+kể
+kể_cả
+kể_như
+kể_tới
+kể_từ
+liên_quan
+loại
+loại_từ
+luôn
+luôn_cả
+luôn_luôn
+luôn_tay
+là
+là_cùng
+là_là
+là_nhiều
+là_phải
+là_thế_nào
+là_vì
+là_ít
+làm
+làm_bằng
+làm_cho
+làm_dần_dần
+làm_gì
+làm_lòng
+làm_lại
+làm_lấy
+làm_mất
+làm_ngay
+làm_như
+làm_nên
+làm_ra
+làm_riêng
+làm_sao
+làm_theo
+làm_thế_nào
+làm_tin
+làm_tôi
+làm_tăng
+làm_tại
+làm_tắp_lự
+làm_vì
+làm_đúng
+làm_được
+lâu
+lâu_các
+lâu_lâu
+lâu_nay
+lâu_ngày
+lên
+lên_cao
+lên_cơn
+lên_mạnh
+lên_ngôi
+lên_nước
+lên_số
+lên_xuống
+lên_đến
+lòng
+lòng_không
+lúc
+lúc_khác
+lúc_lâu
+lúc_nào
+lúc_này
+lúc_sáng
+lúc_trước
+lúc_đi
+lúc_đó
+lúc_đến
+lúc_ấy
+lý_do
+lượng
+lượng_cả
+lượng_số
+lượng_từ
+lại
+lại_bộ
+lại_cái
+lại_còn
+lại_giống
+lại_làm
+lại_người
+lại_nói
+lại_nữa
+lại_quả
+lại_thôi
+lại_ăn
+lại_đây
+lấy
+lấy_có
+lấy_cả
+lấy_giống
+lấy_làm
+lấy_lý_do
+lấy_lại
+lấy_ra
+lấy_ráo
+lấy_sau
+lấy_số
+lấy_thêm
+lấy_thế
+lấy_vào
+lấy_xuống
+lấy_được
+lấy_để
+lần
+lần_khác
+lần_lần
+lần_nào
+lần_này
+lần_sang
+lần_sau
+lần_theo
+lần_trước
+lần_tìm
+lớn
+lớn_lên
+lớn_nhỏ
+lời
+lời_chú
+lời_nói
+mang
+mang_lại
+mang_mang
+mang_nặng
+mang_về
+muốn
+mà
+mà_cả
+mà_không
+mà_lại
+mà_thôi
+mà_vẫn
+mình
+mạnh
+mất
+mất_còn
+mọi
+mọi_giờ
+mọi_khi
+mọi_lúc
+mọi_người
+mọi_nơi
+mọi_sự
+mọi_thứ
+mọi_việc
+mối
+mỗi
+mỗi_lúc
+mỗi_lần
+mỗi_một
+mỗi_ngày
+mỗi_người
+một
+một_cách
+một_cơn
+một_khi
+một_lúc
+một_số
+một_vài
+một_ít
+mới
+mới_hay
+mới_rồi
+mới_đây
+mở
+mở_mang
+mở_nước
+mở_ra
+mợ
+mức
+nay
+ngay
+ngay_bây_giờ
+ngay_cả
+ngay_khi
+ngay_khi_đến
+ngay_lúc
+ngay_lúc_này
+ngay_lập_tức
+ngay_thật
+ngay_tức_khắc
+ngay_tức_thì
+ngay_từ
+nghe
+nghe_chừng
+nghe_hiểu
+nghe_không
+nghe_lại
+nghe_nhìn
+nghe_như
+nghe_nói
+nghe_ra
+nghe_rõ
+nghe_thấy
+nghe_tin
+nghe_trực_tiếp
+nghe_đâu
+nghe_đâu_như
+nghe_được
+nghen
+nghiễm_nhiên
+nghĩ
+nghĩ_lại
+nghĩ_ra
+nghĩ_tới
+nghĩ_xa
+nghĩ_đến
+nghỉm
+ngoài
+ngoài_này
+ngoài_ra
+ngoài_xa
+ngoải
+nguồn
+ngày
+ngày_càng
+ngày_cấp
+ngày_giờ
+ngày_ngày
+ngày_nào
+ngày_này
+ngày_nọ
+ngày_qua
+ngày_rày
+ngày_tháng
+ngày_xưa
+ngày_xửa
+ngày_đến
+ngày_ấy
+ngôi
+ngôi_nhà
+ngôi_thứ
+ngõ_hầu
+ngăn_ngắt
+ngươi
+người_hỏi
+người_khác
+người_khách
+người_mình
+người_nghe
+người_người
+người_nhận
+ngọn
+ngọn_nguồn
+ngọt
+ngồi
+ngồi_bệt
+ngồi_không
+ngồi_sau
+ngồi_trệt
+ngộ_nhỡ
+nhanh
+nhanh_lên
+nhanh_tay
+nhau
+nhiên_hậu
+nhiều
+nhiều_ít
+nhiệt_liệt
+nhung_nhăng
+nhà
+nhà_chung
+nhà_khó
+nhà_làm
+nhà_ngoài
+nhà_ngươi
+nhà_tôi
+nhà_việc
+nhân_dịp
+nhân_tiện
+nhé
+nhìn
+nhìn_chung
+nhìn_lại
+nhìn_nhận
+nhìn_theo
+nhìn_thấy
+nhìn_xuống
+nhóm
+nhón_nhén
+như
+như_ai
+như_chơi
+như_không
+như_là
+như_nhau
+như_quả
+như_sau
+như_thường
+như_thế
+như_thế_nào
+như_thể
+như_trên
+như_trước
+như_tuồng
+như_vậy
+như_ý
+nhưng
+nhưng_mà
+nhược_bằng
+nhất
+nhất_loạt
+nhất_luật
+nhất_là
+nhất_mực
+nhất_nhất
+nhất_quyết
+nhất_sinh
+nhất_thiết
+nhất_thì
+nhất_tâm
+nhất_tề
+nhất_đán
+nhất_định
+nhận
+nhận_biết
+nhận_họ
+nhận_làm
+nhận_nhau
+nhận_ra
+nhận_thấy
+nhận_việc
+nhận_được
+nhằm
+nhằm_khi
+nhằm_lúc
+nhằm_vào
+nhằm_để
+nhỉ
+nhỏ
+nhỏ_người
+nhớ
+nhớ_bập_bõm
+nhớ_lại
+nhớ_lấy
+nhớ_ra
+nhờ
+nhờ_chuyển
+nhờ_có
+nhờ_nhờ
+nhờ_đó
+nhỡ_ra
+những
+những_ai
+những_khi
+những_là
+những_lúc
+những_muốn
+những_như
+nào
+nào_cũng
+nào_hay
+nào_là
+nào_phải
+nào_đâu
+nào_đó
+này
+này_nọ
+nên
+nên_chi
+nên_chăng
+nên_làm
+nên_người
+nên_tránh
+nó
+nóc
+nói
+nói_bông
+nói_chung
+nói_khó
+nói_là
+nói_lên
+nói_lại
+nói_nhỏ
+nói_phải
+nói_qua
+nói_ra
+nói_riêng
+nói_rõ
+nói_thêm
+nói_thật
+nói_toẹt
+nói_trước
+nói_tốt
+nói_với
+nói_xa
+nói_ý
+nói_đến
+nói_đủ
+năm
+năm_tháng
+nơi
+nơi_nơi
+nước
+nước_bài
+nước_cùng
+nước_lên
+nước_nặng
+nước_quả
+nước_xuống
+nước_ăn
+nước_đến
+nấy
+nặng
+nặng_căn
+nặng_mình
+nặng_về
+nếu
+nếu_có
+nếu_cần
+nếu_không
+nếu_mà
+nếu_như
+nếu_thế
+nếu_vậy
+nếu_được
+nền
+nọ
+nớ
+nức_nở
+nữa
+nữa_khi
+nữa_là
+nữa_rồi
+oai_oái
+oái
+pho
+phè
+phè_phè
+phía
+phía_bên
+phía_bạn
+phía_dưới
+phía_sau
+phía_trong
+phía_trên
+phía_trước
+phóc
+phót
+phù_hợp
+phăn_phắt
+phương_chi
+phải
+phải_biết
+phải_chi
+phải_chăng
+phải_cách
+phải_cái
+phải_giờ
+phải_khi
+phải_không
+phải_lại
+phải_lời
+phải_người
+phải_như
+phải_rồi
+phải_tay
+phần
+phần_lớn
+phần_nhiều
+phần_nào
+phần_sau
+phần_việc
+phắt
+phỉ_phui
+phỏng
+phỏng_như
+phỏng_nước
+phỏng_theo
+phỏng_tính
+phốc
+phụt
+phứt
+qua
+qua_chuyện
+qua_khỏi
+qua_lại
+qua_lần
+qua_ngày
+qua_tay
+qua_thì
+qua_đi
+quan_trọng
+quan_trọng_vấn_đề
+quan_tâm
+quay
+quay_bước
+quay_lại
+quay_số
+quay_đi
+quá
+quá_bán
+quá_bộ
+quá_giờ
+quá_lời
+quá_mức
+quá_nhiều
+quá_tay
+quá_thì
+quá_tin
+quá_trình
+quá_tuổi
+quá_đáng
+quá_ư
+quả
+quả_là
+quả_thật
+quả_thế
+quả_vậy
+quận
+ra
+ra_bài
+ra_bộ
+ra_chơi
+ra_gì
+ra_lại
+ra_lời
+ra_ngôi
+ra_người
+ra_sao
+ra_tay
+ra_vào
+ra_ý
+ra_điều
+ra_đây
+ren_rén
+riu_ríu
+riêng
+riêng_từng
+riệt
+rày
+ráo
+ráo_cả
+ráo_nước
+ráo_trọi
+rén
+rén_bước
+rích
+rón_rén
+rõ
+rõ_là
+rõ_thật
+rút_cục
+răng
+răng_răng
+rất
+rất_lâu
+rằng
+rằng_là
+rốt_cuộc
+rốt_cục
+rồi
+rồi_nữa
+rồi_ra
+rồi_sao
+rồi_sau
+rồi_tay
+rồi_thì
+rồi_xem
+rồi_đây
+rứa
+sa_sả
+sang
+sang_năm
+sang_sáng
+sang_tay
+sao
+sao_bản
+sao_bằng
+sao_cho
+sao_vậy
+sao_đang
+sau
+sau_chót
+sau_cuối
+sau_cùng
+sau_hết
+sau_này
+sau_nữa
+sau_sau
+sau_đây
+sau_đó
+so
+so_với
+song_le
+suýt
+suýt_nữa
+sáng
+sáng_ngày
+sáng_rõ
+sáng_thế
+sáng_ý
+sì
+sì_sì
+sất
+sắp
+sắp_đặt
+sẽ
+sẽ_biết
+sẽ_hay
+số
+số_cho_biết
+số_cụ_thể
+số_loại
+số_là
+số_phần
+số_thiếu
+sốt_sột
+sớm
+sớm_ngày
+sở_dĩ
+sử_dụng
+sự
+sự_thế
+sự_việc
+tanh
+tanh_tanh
+tay
+tay_quay
+tha_hồ
+tha_hồ_chơi
+tha_hồ_ăn
+than_ôi
+thanh
+thanh_ba
+thanh_chuyển
+thanh_không
+thanh_thanh
+thanh_tính
+thanh_điều_kiện
+thanh_điểm
+thay_đổi
+thay_đổi_tình_trạng
+theo
+theo_bước
+theo_như
+theo_tin
+thi_thoảng
+thiếu
+thiếu_gì
+thiếu_điểm
+thoạt
+thoạt_nghe
+thoạt_nhiên
+thoắt
+thuần
+thuần_ái
+thuộc
+thuộc_bài
+thuộc_cách
+thuộc_lại
+thuộc_từ
+thà
+thà_là
+thà_rằng
+thành_ra
+thành_thử
+thái_quá
+tháng
+tháng_ngày
+tháng_năm
+tháng_tháng
+thêm
+thêm_chuyện
+thêm_giờ
+thêm_vào
+thì
+thì_giờ
+thì_là
+thì_phải
+thì_ra
+thì_thôi
+thình_lình
+thích
+thích_cứ
+thích_thuộc
+thích_tự
+thích_ý
+thím
+thôi
+thôi_việc
+thúng_thắng
+thương_ôi
+thường
+thường_bị
+thường_hay
+thường_khi
+thường_số
+thường_sự
+thường_thôi
+thường_thường
+thường_tính
+thường_tại
+thường_xuất_hiện
+thường_đến
+thảo_hèn
+thảo_nào
+thấp
+thấp_cơ
+thấp_thỏm
+thấp_xuống
+thấy
+thấy_tháng
+thẩy
+thậm
+thậm_chí
+thậm_cấp
+thậm_từ
+thật
+thật_chắc
+thật_là
+thật_lực
+thật_quả
+thật_ra
+thật_sự
+thật_thà
+thật_tốt
+thật_vậy
+thế
+thế_chuẩn_bị
+thế_là
+thế_lại
+thế_mà
+thế_nào
+thế_nên
+thế_ra
+thế_sự
+thế_thì
+thế_thôi
+thế_thường
+thế_thế
+thế_à
+thế_đó
+thếch
+thỉnh_thoảng
+thỏm
+thốc
+thốc_tháo
+thốt
+thốt_nhiên
+thốt_nói
+thốt_thôi
+thộc
+thời_gian
+thời_gian_sử_dụng
+thời_gian_tính
+thời_điểm
+thục_mạng
+thứ
+thứ_bản
+thứ_đến
+thửa
+thực_hiện
+thực_hiện_đúng
+thực_ra
+thực_sự
+thực_tế
+thực_vậy
+tin
+tin_thêm
+tin_vào
+tiếp_theo
+tiếp_tục
+tiếp_đó
+tiện_thể
+toà
+toé_khói
+toẹt
+trong
+trong_khi
+trong_lúc
+trong_mình
+trong_ngoài
+trong_này
+trong_số
+trong_vùng
+trong_đó
+trong_ấy
+tránh
+tránh_khỏi
+tránh_ra
+tránh_tình_trạng
+tránh_xa
+trên
+trên_bộ
+trên_dưới
+trước
+trước_hết
+trước_khi
+trước_kia
+trước_nay
+trước_ngày
+trước_nhất
+trước_sau
+trước_tiên
+trước_tuổi
+trước_đây
+trước_đó
+trả
+trả_của
+trả_lại
+trả_ngay
+trả_trước
+trếu_tráo
+trển
+trệt
+trệu_trạo
+trỏng
+trời_đất_ơi
+trở_thành
+trừ_phi
+trực_tiếp
+trực_tiếp_làm
+tuy
+tuy_có
+tuy_là
+tuy_nhiên
+tuy_rằng
+tuy_thế
+tuy_vậy
+tuy_đã
+tuyệt_nhiên
+tuần_tự
+tuốt_luốt
+tuốt_tuồn_tuột
+tuốt_tuột
+tuổi
+tuổi_cả
+tuổi_tôi
+tà_tà
+tên
+tên_chính
+tên_cái
+tên_họ
+tên_tự
+tênh
+tênh_tênh
+tìm
+tìm_bạn
+tìm_cách
+tìm_hiểu
+tìm_ra
+tình_trạng
+tính
+tính_căn
+tính_phỏng
+tính_từ
+tít_mù
+tò_te
+tôi
+tôi_con
+tông_tốc
+tù_tì
+tăm_tắp
+tăng
+tăng_chúng
+tăng_cấp
+tăng_giảm
+tăng_thêm
+tăng_thế
+tại
+tại_lòng
+tại_nơi
+tại_sao
+tại_tôi
+tại_vì
+tại_đâu
+tại_đây
+tại_đó
+tạo
+tạo_cơ_hội
+tạo_nên
+tạo_ra
+tạo_ý
+tạo_điều_kiện
+tấm
+tấm_bản
+tấm_các
+tấn
+tấn_tới
+tất_cả
+tất_cả_bao_nhiêu
+tất_thảy
+tất_tần_tật
+tất_tật
+tập_trung
+tắp
+tắp_lự
+tắp_tắp
+tọt
+tỏ_ra
+tỏ_vẻ
+tốc_tả
+tối_ư
+tốt
+tốt_bạn
+tốt_bộ
+tốt_hơn
+tốt_mối
+tốt_ngày
+tột
+tột_cùng
+tớ
+tới
+tới_gần
+tới_mức
+tới_nơi
+tới_thì
+tức_thì
+tức_tốc
+từ
+từ_căn
+từ_giờ
+từ_khi
+từ_loại
+từ_nay
+từ_thế
+từ_tính
+từ_tại
+từ_từ
+từ_ái
+từ_điều
+từ_đó
+từ_ấy
+từng
+từng_cái
+từng_giờ
+từng_nhà
+từng_phần
+từng_thời_gian
+từng_đơn_vị
+từng_ấy
+tự
+tự_cao
+tự_khi
+tự_lượng
+tự_tính
+tự_tạo
+tự_vì
+tự_ý
+tự_ăn
+tựu_trung
+veo
+veo_veo
+việc
+việc_gì
+vung_thiên_địa
+vung_tàn_tán
+vung_tán_tàn
+và
+vài
+vài_ba
+vài_người
+vài_nhà
+vài_nơi
+vài_tên
+vài_điều
+vào
+vào_gặp
+vào_khoảng
+vào_lúc
+vào_vùng
+vào_đến
+vâng
+vâng_chịu
+vâng_dạ
+vâng_vâng
+vâng_ý
+vèo
+vèo_vèo
+vì
+vì_chưng
+vì_rằng
+vì_sao
+vì_thế
+vì_vậy
+ví_bằng
+ví_dù
+ví_phỏng
+ví_thử
+vô_hình_trung
+vô_kể
+vô_luận
+vô_vàn
+vùng
+vùng_lên
+vùng_nước
+văng_tê
+vượt
+vượt_khỏi
+vượt_quá
+vạn_nhất
+vả_chăng
+vả_lại
+vấn_đề
+vấn_đề_quan_trọng
+vẫn
+vẫn_thế
+vậy
+vậy_là
+vậy_mà
+vậy_nên
+vậy_ra
+vậy_thì
+vậy_ư
+về
+về_không
+về_phần
+về_sau
+về_tay
+vị_trí
+vị_tất
+vốn_dĩ
+với
+với_lại
+với_nhau
+vở
+vụt
+vừa
+vừa_khi
+vừa_lúc
+vừa_mới
+vừa_qua
+vừa_rồi
+vừa_vừa
+xa
+xa_cách
+xa_gần
+xa_nhà
+xa_tanh
+xa_tắp
+xa_xa
+xa_xả
+xem
+xem_lại
+xem_ra
+xem_số
+xin
+xin_gặp
+xin_vâng
+xiết_bao
+xon_xón
+xoành_xoạch
+xoét
+xoẳn
+xoẹt
+xuất_kì_bất_ý
+xuất_kỳ_bất_ý
+xuể
+xuống
+xăm_xúi
+xăm_xăm
+xăm_xắm
+xảy_ra
+xềnh_xệch
+xệp
+xử_lý
+yêu_cầu
+à
+à_này
+à_ơi
+ào
+ào_vào
+ào_ào
+á
+á_à
+ái
+ái_chà
+ái_dà
+áng
+áng_như
+âu_là
+ít
+ít_biết
+ít_có
+ít_hơn
+ít_khi
+ít_lâu
+ít_nhiều
+ít_nhất
+ít_nữa
+ít_quá
+ít_ra
+ít_thôi
+ít_thấy
+ô_hay
+ô_hô
+ô_kê
+ô_kìa
+ôi_chao
+ôi_thôi
+ông
+ông_nhỏ
+ông_tạo
+ông_từ
+ông_ấy
+ông_ổng
+úi
+úi_chà
+úi_dào
+ý
+ý_chừng
+ý_da
+ý_hoặc
+ăn
+ăn_chung
+ăn_chắc
+ăn_chịu
+ăn_cuộc
+ăn_hết
+ăn_hỏi
+ăn_làm
+ăn_người
+ăn_ngồi
+ăn_quá
+ăn_riêng
+ăn_tay
+ăn_trên
+ăn_về
+đang
+đang_tay
+đang_thì
+điều
+điều_gì
+điều_kiện
+điểm
+điểm_chính
+điểm_gặp
+điểm_đầu_tiên
+đành_đạch
+đáng
+đáng_kể
+đáng_lí
+đáng_lý
+đáng_lẽ
+đáng_số
+đánh_đùng
+đáo_để
+đâu
+đâu_có
+đâu_cũng
+đâu_như
+đâu_nào
+đâu_phải
+đâu_đâu
+đâu_đây
+đâu_đó
+đây
+đây_này
+đây_rồi
+đây_đó
+đã
+đã_hay
+đã_không
+đã_là
+đã_lâu
+đã_thế
+đã_vậy
+đã_đủ
+đó
+đó_đây
+đúng
+đúng_ngày
+đúng_ra
+đúng_tuổi
+đúng_với
+đơn_vị
+đưa
+đưa_cho
+đưa_chuyện
+đưa_em
+đưa_ra
+đưa_tay
+đưa_tin
+đưa_tới
+đưa_vào
+đưa_về
+đưa_xuống
+đưa_đến
+được
+được_cái
+được_lời
+được_nước
+được_tin
+đại_loại
+đại_phàm
+đại_để
+đạt
+đảm_bảo
+đầu_tiên
+đầy
+đầy_năm
+đầy_phè
+đầy_tuổi
+đặc_biệt
+đặt
+đặt_làm
+đặt_mình
+đặt_mức
+đặt_ra
+đặt_trước
+đặt_để
+đến
+đến_bao_giờ
+đến_cùng
+đến_cùng_cực
+đến_cả
+đến_giờ
+đến_gần
+đến_hay
+đến_khi
+đến_lúc
+đến_lời
+đến_nay
+đến_ngày
+đến_nơi
+đến_nỗi
+đến_thì
+đến_thế
+đến_tuổi
+đến_xem
+đến_điều
+đến_đâu
+đều
+đều_bước
+đều_nhau
+đều_đều
+để
+để_cho
+để_giống
+để_không
+để_lòng
+để_lại
+để_mà
+để_phần
+để_được
+để_đến_nỗi
+đối_với
+đồng_thời
+đủ
+đủ_dùng
+đủ_nơi
+đủ_số
+đủ_điều
+đủ_điểm
+ơ
+ơ_hay
+ơ_kìa
+ơi
+ơi_là
+ư
+ạ
+ạ_ơi
+ấy
+ấy_là
+ầu_ơ
+ắt
+ắt_hẳn
+ắt_là
+ắt_phải
+ắt_thật
+ối_dào
+ối_giời
+ối_giời_ơi
+ồ
+ồ_ồ
+ổng
+ớ
+ớ_này
+ờ
+ờ_ờ
+ở
+ở_lại
+ở_như
+ở_nhờ
+ở_năm
+ở_trên
+ở_vào
+ở_đây
+ở_đó
+ở_được
+ủa
+ứ_hự
+ứ_ừ
+ừ
+ừ_nhé
+ừ_thì
+ừ_ào
+ừ_ừ
+ử
+Chỉ
+Các
+Có
+Cần
+Nhưng
+Tuy_nhiên
+Từ
+Cũng
+từ
+đi
+đến
+Cho_dù
+Chủ_yếu
+Còn
+Có_lẽ
+Có_thể
+Dù
+Dĩ_nhiên
+Những
+Và
+Vì
+Tất_nhiên
+cho_dù
+có_lẽ
+lắm
+Hiện_nay
+Tại_đây
+Như_vậy
+Từ
+Vậy
+Ở
+Bởi
+Theo
+Trên
+Việc
+Do_đó
+Hơn_nữa
+Trong
+Bên
+Cùng
+Cả
+ây
+Khi
+Sau
+Mỗi
+Về
+Không_chỉ
+Mặc_dù
+mặc_dù
+Nhiều
+Như
+Do
+Nếu
+Nếu_như
+Hoặc
+Nhờ
+Hiện
+Hiện_tại
+hiện
+cạnh