#####################################################
### DOCUMENT PROCESSOR [Keywords]
#####################################################
### Jonathan Wang

# ABOUT: 
# This creates an app to chat with PDFs.

# This is the Keywords module,
# which creates keywords from documents.
#####################################################
### TODO Board:
# TODO(Jonathan Wang): Add Maximum marginal relevance to the merger for better keywords.
# TODO(Jonathan Wang): create own version of Rake keywords

#####################################################
### PROGRAM SETTINGS


#####################################################
### PROGRAM IMPORTS
from __future__ import annotations

from typing import Any, Callable, Optional

# Keywords
# from multi_rake import Rake  # removing because of compile issues and lack of maintenance
import yake
from llama_index.core.bridge.pydantic import Field
from llama_index.core.schema import BaseNode

# Own Modules
from metadata_adder import MetadataAdder

#####################################################
### SCRIPT

def get_keywords(input_text: str, top_k: int = 10) -> str:
    """
    Given a string, get its keywords using YAKE.
    (RAKE + Distribution Based Fusion is currently commented out below.)

    Inputs:
        input_text (str): the input text to get keywords from
        top_k (int): the maximum number of keywords to return

    Returns:
        str: A list of the keywords, joined into a string.
    """
    # RAKE
    # kw_extractor = Rake()
    # keywords_rake = kw_extractor.apply(input_text)
    # keywords_rake = dict(keywords_rake)
    # YAKE
    kw_extractor = yake.KeywordExtractor(lan="en", dedupLim=0.9, n=3)
    keywords_yake = kw_extractor.extract_keywords(input_text)
    # reorder scores so that higher is better
    keywords_yake = {keyword[0].lower(): (1 - keyword[1]) for keyword in keywords_yake}
    keywords_yake = dict(
        sorted(keywords_yake.items(), key=lambda x: x[1], reverse=True)  # type hinting YAKE is miserable
    )

    # Merge RAKE and YAKE based on scores.
    # keywords_merged = _merge_on_scores(
    #     list(keywords_yake.keys()), 
    #     list(keywords_rake.keys()), 
    #     list(keywords_yake.values()), 
    #     list(keywords_rake.values()), 
    #     a_weight=0.5, 
    #     top_k=top_k
    # )

    # return (list(keywords_rake.keys())[:top_k], list(keywords_yake.keys())[:top_k], keywords_merged)
    return ", ".join(keywords_yake)  # kinda regretting forcing this into a string


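# NOTE: the commented-out merge above calls a `_merge_on_scores` helper defined
# elsewhere in this project. The function below is only a hypothetical sketch of
# one distribution-based fusion approach (z-normalize each extractor's scores,
# then take a weighted sum per keyword); the project's actual helper may differ.
def _merge_on_scores_sketch(
    a_keys: list[str],
    b_keys: list[str],
    a_scores: list[float],
    b_scores: list[float],
    a_weight: float = 0.5,
    top_k: int = 10,
) -> list[str]:
    """Hypothetical distribution-based fusion of two scored keyword lists."""
    from statistics import mean, pstdev

    def _z_normalize(scores: list[float]) -> list[float]:
        # Rescale scores so the two extractors are comparable regardless of range.
        if not scores:
            return []
        mu, sigma = mean(scores), pstdev(scores) or 1.0
        return [(s - mu) / sigma for s in scores]

    merged: dict[str, float] = {}
    for keys, scores, weight in (
        (a_keys, _z_normalize(a_scores), a_weight),
        (b_keys, _z_normalize(b_scores), 1.0 - a_weight),
    ):
        for key, score in zip(keys, scores):
            merged[key] = merged.get(key, 0.0) + weight * score

    # Highest combined score first, truncated to top_k.
    ranked = sorted(merged.items(), key=lambda kv: kv[1], reverse=True)
    return [key for key, _score in ranked[:top_k]]
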
class KeywordMetadataAdder(MetadataAdder):
    """Adds keyword metadata to a document.

    Args:
        metadata_name: The name of the metadata to add to the document. Defaults to 'keyword_metadata'.
        keywords_function: A function that extracts keywords from a source string, given the string and the number of keywords to return.
    """

    keywords_function: Callable[[str, int], str] = Field(
        description="The function to use to extract keywords from the text. Input is string and number of keywords to extract. Ouptut is string of keywords.",
        default=get_keywords,
    )
    num_keywords: int = Field(
        default=5,
        description="The number of keywords to extract from the text. Defaults to 5.",
    )

    def __init__(
        self,
        metadata_name: str = "keyword_metadata",
        keywords_function: Callable[[str, int], str] = get_keywords,
        num_keywords: int = 5,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__(metadata_name=metadata_name, keywords_function=keywords_function, num_keywords=num_keywords, **kwargs)  # ah yes i love oop :)

    @classmethod
    def class_name(cls) -> str:
        return "KeywordMetadataAdder"

    def get_node_metadata(self, node: BaseNode) -> str | None:
        if not hasattr(node, "text") or node.text is None:
            return None
        return self.keywords_function(node.get_content(), self.num_keywords)
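

#####################################################
### EXAMPLE USAGE
# A minimal sketch for manual testing. It assumes llama_index's TextNode and
# that the MetadataAdder base class needs no constructor arguments beyond the
# ones shown; adjust to the real pipeline as needed.
if __name__ == "__main__":
    from llama_index.core.schema import TextNode

    sample_node = TextNode(
        text=(
            "Retrieval-augmented generation pairs a document retriever with a "
            "large language model so that answers stay grounded in source PDFs."
        )
    )
    adder = KeywordMetadataAdder(metadata_name="keyword_metadata", num_keywords=5)
    print(adder.get_node_metadata(sample_node))  # comma-separated keyword string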