test code
Browse files- classifier.py +147 -0
- merger.py +181 -0
- requirements.txt +4 -1
- ru_errant.py +117 -18
classifier.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
from collections import defaultdict
|
4 |
+
from string import punctuation
|
5 |
+
|
6 |
+
import Levenshtein
|
7 |
+
from errant.edit import Edit
|
8 |
+
|
9 |
+
|
10 |
+
def edit_to_tuple(edit: Edit, idx: int = 0) -> tuple[int, int, str, str, int]:
|
11 |
+
cor_toks_str = " ".join([tok.text for tok in edit.c_toks])
|
12 |
+
return [edit.o_start, edit.o_end, edit.type, cor_toks_str, idx]
|
13 |
+
|
14 |
+
|
15 |
+
def classify(edit: Edit) -> list[Edit]:
    """Classifies an Edit via updating its `type` attribute.

    Returns a list of [o_start, o_end, type, correction, idx] lists, one per
    detected error category (an edit may carry several, e.g. SPELL + CASE).
    """
    one_sided = bool(edit.o_toks) != bool(edit.c_toks)
    if one_sided:
        # pure insertion or pure deletion
        categories = get_one_sided_type(edit.o_toks, edit.c_toks)
    elif edit.o_toks != edit.c_toks:
        categories = get_two_sided_type(edit.o_toks, edit.c_toks)
    else:
        # tokens are identical; nothing to correct
        categories = {"NA": edit.c_toks[0].text}
    results = []
    for category, corrected in (categories or {}).items():
        edit.type = category
        flattened = edit_to_tuple(edit)
        flattened[3] = corrected
        results.append(flattened)
    return results
|
32 |
+
|
33 |
+
|
34 |
+
def get_edit_info(toks):
    """Collects POS tags, dependency labels and merged morph features of tokens.

    Returns:
        (pos_tags, dep_labels, morph_dict) where morph_dict merges the
        "Key=Value" morphology features of all tokens (later tokens win).
    """
    tags, deps, morphology = [], [], {}
    for token in toks:
        tags.append(token.tag_)
        deps.append(token.dep_)
        # spaCy renders morphology as "Key=Value|Key=Value|..."
        for feature in str(token.morph).split("|"):
            if feature.strip():
                key, value = feature.strip().split("=")
                morphology[key] = value
    return tags, deps, morphology
|
47 |
+
|
48 |
+
|
49 |
+
def get_one_sided_type(o_toks, c_toks):
    """Classifies a zero-to-one or one-to-zero error based on a token list."""
    # exactly one side is non-empty; inspect whichever side has tokens
    tags, _, _ = get_edit_info(o_toks if o_toks else c_toks)
    correction = c_toks[0].text if c_toks else ""
    if "PUNCT" in tags or "SPACE" in tags:
        return {"PUNCT": correction}
    return {"SPELL": correction}
|
55 |
+
|
56 |
+
|
57 |
+
def get_two_sided_type(o_toks, c_toks) -> dict[str, str]:
    """Classifies a one-to-one or one-to-many or many-to-one error based on token lists.

    Returns a dict mapping error category (PUNCT, SPELL, CASE, YO) to the
    correction string for that category.
    """
    # one-to-one cases
    if len(o_toks) == len(c_toks) == 1:
        # both tokens consist solely of punctuation/spaces -> punctuation edit
        if (
            all(char in punctuation + " " for char in o_toks[0].text) and
            all(char in punctuation + " " for char in c_toks[0].text)
        ):
            return {"PUNCT": c_toks[0].text}
        source_w, correct_w = o_toks[0].text, c_toks[0].text
        if source_w != correct_w:
            # If both strings are lowercase or both are uppercase, and neither
            # contains "ё", the difference can only be a SPELL error
            # (no CASE change and no YO substitution is possible).
            if (((source_w.islower() and correct_w.islower()) or
                 (source_w.isupper() and correct_w.isupper())) and
                    "ё" not in source_w + correct_w):
                return {"SPELL": correct_w}
            # Edits with multiple errors (e.g. SPELL + CASE):
            # Step 1. Make char-level Levenshtein operations table.
            char_edits = Levenshtein.editops(source_w, correct_w)
            # Step 2. Classify operations (CASE, YO, SPELL).
            edits_classified = classify_char_edits(char_edits, source_w, correct_w)
            # Step 3. Combine the same-typed errors into minimal string pairs.
            separated_edits = get_edit_strings(source_w, correct_w, edits_classified)
            return separated_edits
        # NOTE(review): when both single tokens have identical text, control
        # falls through to the one-to-many branch below — presumably
        # unreachable because an Edit implies a difference; confirm.
    # one-to-many and many-to-one cases
    if all(char in punctuation + " " for char in o_toks.text + c_toks.text):
        return {"PUNCT": c_toks.text}
    # Glue the correction tokens back together; re-attach hyphens that the
    # tokenizer split off ("сub - way" -> "sub-way").
    joint_corr_str = " ".join([tok.text for tok in c_toks])
    joint_corr_str = joint_corr_str.replace("- ", "-").replace(" -", "-")
    return {"SPELL": joint_corr_str}
|
88 |
+
|
89 |
+
|
90 |
+
def classify_char_edits(char_edits, source_w, correct_w):
    """Classifies char-level Levenshtein operations into SPELL, YO and CASE.

    Each (op, src_idx, cor_idx) triple is labelled with an error category;
    a single replacement may yield two labels (CASE + SPELL) when both the
    letter and its case differ.
    """
    labelled = []
    for op, src_i, cor_i in char_edits:
        if op != "replace":
            # insertions and deletions are always spelling errors
            labelled.append((op, src_i, cor_i, "SPELL"))
            continue
        src_char, cor_char = source_w[src_i], correct_w[cor_i]
        if "ё" in (src_char, cor_char):
            # е <-> ё (and anything involving ё) is its own category
            labelled.append((op, src_i, cor_i, "YO"))
        elif src_char.lower() == cor_char.lower():
            # same letter, different case
            labelled.append((op, src_i, cor_i, "CASE"))
        else:
            # different letters: a spelling error, and additionally a case
            # error when the case flips too
            if ((src_char.islower() and cor_char.isupper()) or
                    (src_char.isupper() and cor_char.islower())):
                labelled.append((op, src_i, cor_i, "CASE"))
            labelled.append((op, src_i, cor_i, "SPELL"))
    return labelled
|
109 |
+
|
110 |
+
|
111 |
+
def get_edit_strings(source: str, correction: str,
                     edits_classified: list[tuple]) -> dict[str, str]:
    """
    Applies classified (SPELL, YO and CASE) char operations to source word separately.
    Returns a dict mapping error type to source string with corrections of this type only.

    Each edit is a (op, src_idx, cor_idx, error_type) tuple as produced by
    `classify_char_edits`.
    """
    # Each error type starts from an untouched copy of the source word.
    separated_edits = defaultdict(lambda: source)
    shift = 0  # char position shift to consider on deletions and insertions
    # NOTE(review): `shift` is shared across all error types even though each
    # type's string is patched independently — if deletions/insertions of
    # different types interleave, the offsets may drift; confirm intended.
    for edit in edits_classified:
        edit_type = edit[3]
        curr_src = separated_edits[edit_type]
        if edit_type == "CASE":  # SOURCE letter spelled in CORRECTION case
            if correction[edit[2]].isupper():
                correction_char = source[edit[1]].upper()
            else:
                correction_char = source[edit[1]].lower()
        else:
            # SPELL / YO: take the correction character, preserving the
            # source character's case for replacements.
            if edit[0] == "delete":
                correction_char = ""
            elif edit[0] == "insert":
                correction_char = correction[edit[2]]
            elif source[edit[1]].isupper():
                correction_char = correction[edit[2]].upper()
            else:
                correction_char = correction[edit[2]].lower()
        # Splice the correction character into this type's working string,
        # adjusting the index by the accumulated insert/delete shift.
        if edit[0] == "replace":
            separated_edits[edit_type] = curr_src[:edit[1] + shift] + correction_char + \
                curr_src[edit[1]+shift + 1:]
        elif edit[0] == "delete":
            separated_edits[edit_type] = curr_src[:edit[1] + shift] + \
                curr_src[edit[1]+shift + 1:]
            shift -= 1
        elif edit[0] == "insert":
            separated_edits[edit_type] = curr_src[:edit[1] + shift] + correction_char + \
                curr_src[edit[1]+shift:]
            shift += 1
    return dict(separated_edits)
|
merger.py
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
import itertools
|
4 |
+
import re
|
5 |
+
from string import punctuation
|
6 |
+
|
7 |
+
import Levenshtein
|
8 |
+
from errant.alignment import Alignment
|
9 |
+
from errant.edit import Edit
|
10 |
+
|
11 |
+
|
12 |
+
def get_rule_edits(alignment: Alignment) -> list[Edit]:
    """Groups word-level alignment according to merging rules."""
    collected = []
    for op_type, ops in group_alignment(alignment, "new"):
        ops = list(ops)
        # matches never produce edits
        if op_type == "M":
            continue
        # transpositions are always split into one edit per operation;
        # everything else (D/I/S runs) goes through the rule-based merger
        spans = ops if op_type == "T" else process_seq(ops, alignment)
        collected.extend(
            Edit(alignment.orig, alignment.cor, span[1:]) for span in spans
        )
    return collected
|
33 |
+
|
34 |
+
|
35 |
+
def group_alignment(alignment: Alignment, mode: str = "default") -> list[tuple[str, list[tuple]]]:
    """
    Does initial alignment grouping:
    1. Make groups of MDM, MIM or MSM.
    2. In remaining operations, make groups of Ms, groups of Ts, and D/I/Ss.
       Do not group what was on the sides of M[DIS]M: SSMDMS -> [SS, MDM, S], not [MDM, SSS].
    3. Sort groups by the order in which they appear in the alignment.

    Returns a list of (group_type, operations) pairs, where group_type is
    "M", "T", "DIS" or "MSM".
    """
    if mode == "new":
        op_groups = []
        # Format operation types sequence as string to use regex sequence search,
        # e.g. [("M",...), ("S",...)] -> "MS".
        all_ops_seq = "".join([op[0][0] for op in alignment.align_seq])
        # Find M[DIS]M groups and merge (need them to detect hyphen vs. space spelling).
        # NOTE(review): the label "MSM" is used for MDM and MIM groups too —
        # downstream only distinguishes "M"/"T" vs. everything else; confirm.
        ungrouped_ids = list(range(len(alignment.align_seq)))
        for match in re.finditer("M[DIS]M", all_ops_seq):
            start, end = match.start(), match.end()
            op_groups.append(("MSM", alignment.align_seq[start:end]))
            for idx in range(start, end):
                ungrouped_ids.remove(idx)
        # Group remaining operations by default rules (groups of M, T and rest).
        if ungrouped_ids:
            def get_group_type(operation):
                # M and T group only with themselves; D/I/S group together
                return operation if operation in {"M", "T"} else "DIS"
            curr_group = [alignment.align_seq[ungrouped_ids[0]]]
            last_oper_type = get_group_type(curr_group[0][0][0])
            for i, idx in enumerate(ungrouped_ids[1:], start=1):
                operation = alignment.align_seq[idx]
                oper_type = get_group_type(operation[0][0])
                # Extend the current group only if the type matches and, for
                # D/I/S, the operations are adjacent in the original alignment
                # (they must not bridge a removed M[DIS]M span).
                if (oper_type == last_oper_type and
                        (idx - ungrouped_ids[i-1] == 1 or oper_type in {"M", "T"})):
                    curr_group.append(operation)
                else:
                    op_groups.append((last_oper_type, curr_group))
                    curr_group = [operation]
                    last_oper_type = oper_type
            if curr_group:
                op_groups.append((last_oper_type, curr_group))
        # Sort groups by the start id of the first group entry, restoring
        # document order after the two grouping passes above.
        op_groups = sorted(op_groups, key=lambda x: x[1][0][1])
    else:
        # Default (original ERRANT) behaviour: runs of M, runs of T, and all
        # other operations lumped together under the key False.
        grouped = itertools.groupby(alignment.align_seq,
                                    lambda x: x[0][0] if x[0][0] in {"M", "T"} else False)
        op_groups = [(op, list(group)) for op, group in grouped]
    return op_groups
|
79 |
+
|
80 |
+
|
81 |
+
def process_seq(seq: list[tuple], alignment: Alignment) -> list[tuple]:
    """Applies merging rules to previously formed alignment groups (`seq`).

    Recursively merges or splits sub-spans of `seq` (a list of alignment
    operations) and returns the resulting list of operation tuples.
    """
    # Return single alignments
    if len(seq) <= 1:
        return seq
    # Get the ops for the whole sequence
    ops = [op[0] for op in seq]

    # Get indices of all start-end combinations in the seq: 012 = 01, 02, 12
    combos = list(itertools.combinations(range(0, len(seq)), 2))
    # Sort them starting with largest spans first
    combos.sort(key=lambda x: x[1] - x[0], reverse=True)
    # Loop through combos
    for start, end in combos:
        # Ignore ranges that do NOT contain a substitution, deletion or insertion.
        if not any(type_ in ops[start:end + 1] for type_ in ["D", "I", "S"]):
            continue
        # Merge all D xor I ops. (95% of human multi-token edits contain S).
        if set(ops[start:end + 1]) == {"D"} or set(ops[start:end + 1]) == {"I"}:
            return (process_seq(seq[:start], alignment)
                    + merge_edits(seq[start:end + 1])
                    + process_seq(seq[end + 1:], alignment))
        # Get the tokens in orig and cor.
        o = alignment.orig[seq[start][1]:seq[end][2]]
        c = alignment.cor[seq[start][3]:seq[end][4]]
        if ops[start:end + 1] in [["M", "D", "M"], ["M", "I", "M"], ["M", "S", "M"]]:
            # merge hyphens
            # NOTE(review): `o[start + 1]` / `c[start + 1]` index the sliced
            # spans with the seq-relative offset; this is only the middle
            # token when start == 0 — confirm intended.
            if (o[start + 1].text == "-" or c[start + 1].text == "-") and len(o) != len(c):
                return (process_seq(seq[:start], alignment)
                        + merge_edits(seq[start:end + 1])
                        + process_seq(seq[end + 1:], alignment))
            # if it is not a hyphen-space edit, return only punct edit
            return seq[start + 1: end]
        # Merge possessive suffixes: [friends -> friend 's]
        if o[-1].tag_ == "POS" or c[-1].tag_ == "POS":
            return (process_seq(seq[:end - 1], alignment)
                    + merge_edits(seq[end - 1:end + 1])
                    + process_seq(seq[end + 1:], alignment))
        # Case changes (spaCy `.lower` is the hash of the lowercased text,
        # so this compares lowercased final tokens)
        if o[-1].lower == c[-1].lower:
            # Merge first token I or D: [Cat -> The big cat]
            if (start == 0 and
                    (len(o) == 1 and c[0].text[0].isupper()) or
                    (len(c) == 1 and o[0].text[0].isupper())):
                return (merge_edits(seq[start:end + 1])
                        + process_seq(seq[end + 1:], alignment))
            # Merge with previous punctuation: [, we -> . We], [we -> . We]
            if (len(o) > 1 and is_punct(o[-2])) or \
                    (len(c) > 1 and is_punct(c[-2])):
                return (process_seq(seq[:end - 1], alignment)
                        + merge_edits(seq[end - 1:end + 1])
                        + process_seq(seq[end + 1:], alignment))
        # Merge whitespace/hyphens: [acat -> a cat], [sub - way -> subway]
        s_str = re.sub("['-]", "", "".join([tok.lower_ for tok in o]))
        t_str = re.sub("['-]", "", "".join([tok.lower_ for tok in c]))
        if s_str == t_str or s_str.replace(" ", "") == t_str.replace(" ", ""):
            return (process_seq(seq[:start], alignment)
                    + merge_edits(seq[start:end + 1])
                    + process_seq(seq[end + 1:], alignment))
        # Merge same POS or auxiliary/infinitive/phrasal verbs:
        # [to eat -> eating], [watch -> look at]
        # NOTE(review): `tok.pos` is spaCy's integer POS id while the set
        # below holds strings ("AUX", ...) — `issubset` can never be true
        # unless `pos_` was meant; confirm.
        pos_set = set([tok.pos for tok in o] + [tok.pos for tok in c])
        if len(o) != len(c) and (len(pos_set) == 1 or pos_set.issubset({"AUX", "PART", "VERB"})):
            return (process_seq(seq[:start], alignment)
                    + merge_edits(seq[start:end + 1])
                    + process_seq(seq[end + 1:], alignment))
        # Split rules take effect when we get to smallest chunks
        if end - start < 2:
            # Split adjacent substitutions
            if len(o) == len(c) == 2:
                return (process_seq(seq[:start + 1], alignment)
                        + process_seq(seq[start + 1:], alignment))
            # Split similar substitutions at sequence boundaries
            if ((ops[start] == "S" and char_cost(o[0].text, c[0].text) > 0.75) or
                    (ops[end] == "S" and char_cost(o[-1].text, c[-1].text) > 0.75)):
                return (process_seq(seq[:start + 1], alignment)
                        + process_seq(seq[start + 1:], alignment))
            # Split final determiners
            # NOTE(review): same int-vs-string concern for `pos == "DET"`.
            if (end == len(seq) - 1 and
                    ((ops[-1] in {"D", "S"} and o[-1].pos == "DET") or
                     (ops[-1] in {"I", "S"} and c[-1].pos == "DET"))):
                return process_seq(seq[:-1], alignment) + [seq[-1]]
    return seq
|
164 |
+
|
165 |
+
|
166 |
+
def is_punct(token) -> bool:
    """Returns True if the token's text occurs inside `string.punctuation`."""
    return punctuation.find(token.text) >= 0
|
168 |
+
|
169 |
+
|
170 |
+
def char_cost(a: str, b: str) -> float:
    """Calculate the cost of character alignment; i.e. char similarity.

    Returns Levenshtein.ratio: a normalized similarity in [0.0, 1.0],
    where 1.0 means the strings are identical.
    """

    return Levenshtein.ratio(a, b)
|
174 |
+
|
175 |
+
|
176 |
+
def merge_edits(seq: list[tuple]) -> list[tuple]:
    """Collapses an alignment subsequence into a single "X"-typed edit span.

    The merged span runs from the first operation's start offsets to the
    last operation's end offsets in both orig and cor.
    """
    if not seq:
        # nothing to merge; hand the (empty) input back unchanged
        return seq
    first, last = seq[0], seq[-1]
    return [("X", first[1], last[2], first[3], last[4])]
|
requirements.txt
CHANGED
@@ -1 +1,4 @@
|
|
1 |
-
git+https://github.com/huggingface/evaluate@main
|
|
|
|
|
|
|
|
1 |
+
git+https://github.com/huggingface/evaluate@main
|
2 |
+
git+https://github.com/Askinkaty/errant/@4183e57
|
3 |
+
Levenshtein
|
4 |
+
ru-core-news-lg @ https://huggingface.co/spacy/ru_core_news_lg/resolve/main/ru_core_news_lg-any-py3-none-any.whl
|
ru_errant.py
CHANGED
@@ -12,11 +12,26 @@
|
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
"""TODO: Add a description here."""
|
|
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
import evaluate
|
17 |
import datasets
|
18 |
|
19 |
-
|
20 |
# TODO: Add BibTeX citation
|
21 |
_CITATION = """\
|
22 |
@InProceedings{huggingface:module,
|
@@ -31,7 +46,6 @@ _DESCRIPTION = """\
|
|
31 |
This new module is designed to solve this great ML task and is crafted with a lot of care.
|
32 |
"""
|
33 |
|
34 |
-
|
35 |
# TODO: Add description of the arguments of the module here
|
36 |
_KWARGS_DESCRIPTION = """
|
37 |
Calculates how good are predictions given some references, using certain scores
|
@@ -57,6 +71,40 @@ Examples:
|
|
57 |
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
|
58 |
|
59 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
61 |
class RuErrant(evaluate.Metric):
|
62 |
"""TODO: Short description of my evaluation module."""
|
@@ -70,26 +118,77 @@ class RuErrant(evaluate.Metric):
|
|
70 |
citation=_CITATION,
|
71 |
inputs_description=_KWARGS_DESCRIPTION,
|
72 |
# This defines the format of each prediction and reference
|
73 |
-
features=datasets.Features(
|
74 |
-
|
75 |
-
|
76 |
-
|
|
|
|
|
|
|
77 |
# Homepage of the module for documentation
|
78 |
homepage="http://module.homepage",
|
79 |
# Additional links to the codebase or references
|
80 |
-
codebase_urls=["
|
81 |
reference_urls=["http://path.to.reference.url/new_module"]
|
82 |
)
|
83 |
|
84 |
def _download_and_prepare(self, dl_manager):
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
"""TODO: Add a description here."""
|
15 |
+
from __future__ import annotations
|
16 |
|
17 |
+
import re
|
18 |
+
from collections import Counter, namedtuple
|
19 |
+
from typing import Iterable
|
20 |
+
from tqdm.auto import tqdm
|
21 |
+
|
22 |
+
from errant.annotator import Annotator
|
23 |
+
from errant.commands.compare_m2 import process_edits
|
24 |
+
from errant.commands.compare_m2 import evaluate_edits
|
25 |
+
from errant.commands.compare_m2 import merge_dict
|
26 |
+
from errant.edit import Edit
|
27 |
+
import spacy
|
28 |
+
from spacy.tokenizer import Tokenizer
|
29 |
+
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex
|
30 |
+
import classifier
|
31 |
+
import merger
|
32 |
import evaluate
|
33 |
import datasets
|
34 |
|
|
|
35 |
# TODO: Add BibTeX citation
|
36 |
_CITATION = """\
|
37 |
@InProceedings{huggingface:module,
|
|
|
46 |
This new module is designed to solve this great ML task and is crafted with a lot of care.
|
47 |
"""
|
48 |
|
|
|
49 |
# TODO: Add description of the arguments of the module here
|
50 |
_KWARGS_DESCRIPTION = """
|
51 |
Calculates how good are predictions given some references, using certain scores
|
|
|
71 |
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
|
72 |
|
73 |
|
74 |
+
def update_spacy_tokenizer(nlp):
    """
    Changes the spaCy tokenizer to parse additional patterns.

    Adds "](", escaped-backslash and escaped-quote splitting on top of the
    pipeline defaults, and narrows URL matching to plain http(s) prefixes.

    Args:
        nlp: a loaded spaCy pipeline whose tokenizer is replaced in place.

    Returns:
        The same pipeline object, with its tokenizer rebuilt.
    """
    # Raw strings: "\]\(" contained invalid escape sequences, which raise
    # SyntaxWarning on modern Python; the regex itself is unchanged.
    infix_re = compile_infix_regex(nlp.Defaults.infixes[:-1] + [r"\]\("])
    simple_url_re = re.compile(r"^https?://")
    nlp.tokenizer = Tokenizer(
        nlp.vocab,
        prefix_search=compile_prefix_regex(nlp.Defaults.prefixes + ['\\\\\"']).search,
        suffix_search=compile_suffix_regex(nlp.Defaults.suffixes + ['\\\\']).search,
        infix_finditer=infix_re.finditer,
        token_match=None,
        url_match=simple_url_re.match
    )
    return nlp
|
89 |
+
|
90 |
+
|
91 |
+
def annotate_errors(self, orig: str, cor: str, merging: str = "rules") -> list[Edit]:
    """
    Overrides `Annotator.annotate()` function to allow multiple errors per token.

    This is necessary to parse combined errors, e.g.:
        ["werd", "Word"] >>> Errors: ["SPELL", "CASE"]
    The `classify()` method called inside is the overriding classifier in
    classifier.py, which may return several categorised edits per alignment edit.
    """
    word_alignment = self.annotator.align(orig, cor, False)
    merged = self.annotator.merge(word_alignment, merging)
    labelled = [
        categorised
        for edit in merged
        for categorised in self.annotator.classify(edit)
    ]
    # order by source span start, then by error category
    return sorted(labelled, key=lambda e: (e[0], e[2]))
|
106 |
+
|
107 |
+
|
108 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
109 |
class RuErrant(evaluate.Metric):
|
110 |
"""TODO: Short description of my evaluation module."""
|
|
|
118 |
citation=_CITATION,
|
119 |
inputs_description=_KWARGS_DESCRIPTION,
|
120 |
# This defines the format of each prediction and reference
|
121 |
+
features=datasets.Features(
|
122 |
+
{
|
123 |
+
"sources": datasets.Value("string", id="sequence"),
|
124 |
+
"corrections": datasets.Value("string", id="sequence"),
|
125 |
+
"answers": datasets.Value("string", id="sequence"),
|
126 |
+
}
|
127 |
+
),
|
128 |
# Homepage of the module for documentation
|
129 |
homepage="http://module.homepage",
|
130 |
# Additional links to the codebase or references
|
131 |
+
codebase_urls=["https://github.com/ai-forever/sage"],
|
132 |
reference_urls=["http://path.to.reference.url/new_module"]
|
133 |
)
|
134 |
|
135 |
def _download_and_prepare(self, dl_manager):
    # Build the Russian ERRANT annotator once per metric instance, plugging
    # the local merger/classifier modules in place of errant's English
    # defaults and a patched spaCy tokenizer for extra punctuation patterns.
    self.annotator = Annotator("ru",
                               nlp=update_spacy_tokenizer(spacy.load("ru_core_news_lg")),
                               merger=merger,
                               classifier=classifier)
|
140 |
+
|
141 |
+
def _compute(self, sources, corrections, answers):
    """
    Evaluates iterables of sources, hyp and ref corrections with ERRANT metric.

    Args:
        sources (Iterable[str]): an iterable of source texts;
        corrections (Iterable[str]): an iterable of gold corrections for the source texts;
        answers (Iterable[str]): an iterable of evaluated corrections for the source texts;

    Returns:
        dict[str, tuple[float, ...]]: a dict mapping error categories to the corresponding
            P, R, F1 metric values.
    """
    # Corpus-level counters: overall TP/FP/FN and per-category TP/FP/FN.
    best_dict = Counter({"tp": 0, "fp": 0, "fn": 0})
    best_cats = {}
    sents = zip(sources, corrections, answers)
    # NOTE(review): `len(sources)` requires a sized container despite the
    # Iterable annotation — confirm callers never pass a generator.
    pb = tqdm(sents, desc="Calculating errant metric", total=len(sources))
    for sent_id, sent in enumerate(pb):
        src = self.annotator.parse(sent[0])
        ref = self.annotator.parse(sent[1])
        hyp = self.annotator.parse(sent[2])
        # Align hyp and ref corrections and annotate errors
        hyp_edits = self.annotate_errors(src, hyp)
        ref_edits = self.annotate_errors(src, ref)
        # Process the edits for detection/correction based on args
        # (namedtuples stand in for errant's argparse namespaces).
        ProcessingArgs = namedtuple("ProcessingArgs",
                                    ["dt", "ds", "single", "multi", "filt", "cse"],
                                    defaults=[False, False, False, False, [], True])
        processing_args = ProcessingArgs()
        hyp_dict = process_edits(hyp_edits, processing_args)
        ref_dict = process_edits(ref_edits, processing_args)
        # Evaluate edits and get best TP, FP, FN hyp+ref combo.
        EvaluationArgs = namedtuple("EvaluationArgs",
                                    ["beta", "verbose"],
                                    defaults=[1.0, False])
        evaluation_args = EvaluationArgs()
        count_dict, cat_dict = evaluate_edits(
            hyp_dict, ref_dict, best_dict, sent_id, evaluation_args)
        # Merge these dicts with best_dict and best_cats
        best_dict += Counter(count_dict)  # corpus-level TP, FP, FN
        best_cats = merge_dict(best_cats, cat_dict)  # corpus-level errortype-wise TP, FP, FN
    # Convert the per-category counts into precision/recall/F1.
    cat_prf = {}
    for cat, values in best_cats.items():
        tp, fp, fn = values  # fp - extra corrections, fn - missed corrections
        p = float(tp) / (tp + fp) if tp + fp else 1.0
        r = float(tp) / (tp + fn) if tp + fn else 1.0
        f = (2 * p * r) / (p + r) if p + r else 0.0
        cat_prf[cat] = (p, r, f)

    # Categories with no observed edits score a perfect 1.0 by convention.
    for error_category in ["CASE", "PUNCT", "SPELL", "YO"]:
        if error_category not in cat_prf:
            cat_prf[error_category] = (1.0, 1.0, 1.0)

    return cat_prf
|