Spaces:

shach1995
/

news_summarizer

Sleeping

File size: 1,984 Bytes

import MeCab
import re
from sklearn.feature_extraction.text import TfidfVectorizer


class JapaneseTextVectorizer:
    """Extract nouns from Japanese text with MeCab and rank them by TF-IDF.

    The text is tokenized by MeCab, nouns are kept (except purely numeric or
    purely hiragana ones), and a scikit-learn ``TfidfVectorizer`` is fitted
    on those nouns to score them.
    """

    # Matches a token written entirely in hiragana (U+3040-U+309F).
    # Compiled once here instead of being re-looked-up for every token.
    _HIRAGANA_ONLY = re.compile(r'^[\u3040-\u309F]+$')

    def __init__(self):
        """Initialize the MeCab tagger and the TF-IDF vectorizer."""
        self.mecab_tagger = MeCab.Tagger()
        # The custom token pattern keeps single-character tokens (the
        # scikit-learn default requires at least two word characters);
        # norm=None leaves the tf-idf weights un-normalized.
        self.tfidf_model = TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b', norm=None)
        # Not used by any method of this class; kept so existing callers that
        # read or write the attribute keep working.
        self.vocab_list = []

    def _extract_nouns(self, text):
        """Extract the nouns contained in ``text``.

        Parameters:
        - text (str): text to extract nouns from

        Returns:
        - nouns (list): surface forms of the extracted nouns, in order of
          appearance (duplicates are kept). Empty when ``text`` is empty.
        """
        if not text:
            # Nothing to parse; avoid handing empty/None input to MeCab.
            return []

        nouns = []
        node = self.mecab_tagger.parseToNode(text)
        while node:
            word = node.surface
            # The first comma-separated field of the feature string is the
            # part of speech.
            part_of_speech = node.feature.split(",", 1)[0]
            if (
                part_of_speech == "名詞"
                and not word.isnumeric()
                and not self._HIRAGANA_ONLY.match(word)
            ):
                # Keep nouns unless they are purely numeric or purely hiragana.
                nouns.append(word)
            node = node.next
        return nouns

    def fit_transform(self, text, top_n=5):
        """Return the highest-scoring nouns of ``text`` by TF-IDF.

        The vectorizer is re-fitted on every call, treating each extracted
        noun as its own document, and the space-joined nouns are then
        transformed as a single document.

        Parameters:
        - text (str): text to convert to a TF-IDF representation
        - top_n (int): maximum number of entries to return (default 5)

        Returns:
        - tfidf_dict (dict): mapping of term to TF-IDF value, ordered from the
          highest value to the lowest, holding at most ``top_n`` entries.
          Empty when no usable noun is found in ``text``.
        """
        nouns = self._extract_nouns(text)
        if not nouns:
            # Fitting on an empty corpus raises ValueError (empty vocabulary).
            return {}

        try:
            self.tfidf_model.fit(nouns)
        except ValueError:
            # Raised by scikit-learn when no noun yields a token for the
            # configured token_pattern (e.g. nouns made only of symbols),
            # i.e. the vocabulary would be empty.
            return {}

        tfidf_vec = self.tfidf_model.transform([" ".join(nouns)]).toarray()[0]
        feature_names = self.tfidf_model.get_feature_names_out()

        # Drop zero-weight terms, then sort by TF-IDF value (descending). The
        # sort is stable, so ties keep the vectorizer's feature order.
        scored = [(word, value) for word, value in zip(feature_names, tfidf_vec) if value > 0]
        scored.sort(key=lambda item: item[1], reverse=True)
        return dict(scored[:top_n])