File size: 3,236 Bytes
fda57dd
 
2f6a112
fda57dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42eceac
fda57dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2f6a112
fda57dd
 
2f6a112
57db812
fda57dd
57db812
 
fda57dd
42eceac
 
fda57dd
 
 
 
 
 
 
 
 
 
 
 
 
 
57db812
 
2f6a112
 
fda57dd
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
from pathlib import Path

import gradio as gr
from jinja2 import Environment
from tokenizers.pre_tokenizers import Whitespace
from transformers import pipeline

from recognizers import DiffAlign, DiffDel


def load_pipeline(model_name_or_path: str = "ZurichNLP/unsup-simcse-xlm-roberta-base"):
    """Create the ``feature-extraction`` pipeline used by the diff recognizers.

    Args:
        model_name_or_path: Forwarded as the ``model`` argument of
            ``transformers.pipeline``.

    Returns:
        Whatever ``pipeline("feature-extraction", model=...)`` returns.
    """
    task = "feature-extraction"
    feature_extractor = pipeline(task, model=model_name_or_path)
    return feature_extractor


def _labels_to_buckets(labels, min_value: float, max_value: float) -> tuple:
    """Min-max scale raw labels and round them to integer buckets in 0..10."""
    span = max_value - min_value
    # Clamp on both sides: the bounds are empirical, so a label can fall below
    # min_value (which would otherwise give a negative bucket) or above
    # max_value (which would otherwise exceed 10).
    return tuple(
        round(max(0, min(10, (label - min_value) / span * 10)))
        for label in labels
    )


def generate_diff(text_a: str, text_b: str, method: str):
    """Compare two texts and render the per-token difference labels as HTML.

    Both texts are split with the module-level ``tokenizer``, re-joined with
    single spaces and passed to the selected recognizer. The predicted labels
    are normalized with method-specific empirical bounds, bucketed into the
    integers 0..10 and rendered with ``result_template.html``.

    Args:
        text_a: First input text.
        text_b: Second input text.
        method: Either ``"DiffAlign"`` or ``"DiffDel"``.

    Returns:
        A ``(html_a, html_b)`` tuple of strings, one rendering per input text.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    global my_pipeline
    # Lazily create the shared pipeline if nothing loaded it yet.
    if my_pipeline is None:
        my_pipeline = load_pipeline()

    # Each method comes with its own empirical label range used for scaling.
    if method == "DiffAlign":
        diff = DiffAlign(pipeline=my_pipeline)
        min_value = 0.3758048415184021 - 0.1
        max_value = 1.045647144317627 - 0.1
    elif method == "DiffDel":
        diff = DiffDel(pipeline=my_pipeline)
        min_value = 0.4864141941070556
        max_value = 0.5012983083724976 + 0.025
    else:
        raise ValueError(f"Unknown method: {method}")

    # pre_tokenize_str yields tuples whose first element is the token text.
    encoding_a = tokenizer.pre_tokenize_str(text_a)
    encoding_b = tokenizer.pre_tokenize_str(text_b)

    result = diff.predict(
        a=" ".join([token[0] for token in encoding_a]),
        b=" ".join([token[0] for token in encoding_b]),
    )

    result.add_whitespace(encoding_a, encoding_b)

    # Normalize labels with the empirical min/max and bucket them into 0..10.
    result.labels_a = _labels_to_buckets(result.labels_a, min_value, max_value)
    result.labels_b = _labels_to_buckets(result.labels_b, min_value, max_value)

    # NOTE(review): Environment() uses Jinja2's defaults (autoescape off) and
    # the token text originates from the user's text boxes -- confirm that the
    # template escapes it.
    template_path = Path(__file__).parent / "result_template.html"
    template = Environment().from_string(template_path.read_text())
    # NOTE(review): this directory is created but not used in this function;
    # kept to preserve the existing side effect.
    html_dir = Path(__file__).parent / "html_out"
    html_dir.mkdir(exist_ok=True)

    html_a = template.render(token_labels=result.token_labels_a)
    html_b = template.render(token_labels=result.token_labels_b)
    return str(html_a), str(html_b)


# Whitespace pre-tokenizer that generate_diff uses to split both input texts.
tokenizer = Whitespace()
# Shared feature-extraction pipeline; None until load_pipeline() has been called.
my_pipeline = None


# Gradio UI: two text inputs, a method selector, two HTML result panels and a
# button that runs generate_diff. Markdown files next to this script provide
# the text shown above and below the controls.
with gr.Blocks() as demo:
    preamble = (Path(__file__).parent / "preamble.md").read_text()
    gr.Markdown(preamble)
    with gr.Row():
        text_a = gr.Textbox(label="Text A", value="We'll meet Steve on Wednesday.", lines=2)
        text_b = gr.Textbox(label="Text B", value="We are going to see Mary on Friday.", lines=2)
    with gr.Row():
        method = gr.Dropdown(choices=["DiffAlign", "DiffDel"], label="Comparison Method", value="DiffAlign")
    with gr.Row():
        with gr.Column(variant="panel"):
            output_a = gr.HTML(label="Result for text A", show_label=True)
        with gr.Column(variant="panel"):
            output_b = gr.HTML(label="Result for text B", show_label=True)
    with gr.Row():
        # A Button's caption is its ``value``; ``label`` is not the caption
        # parameter, so the text is passed as ``value`` to be displayed.
        submit_btn = gr.Button(value="Generate Diff")
        # The inputs map positionally onto generate_diff(text_a, text_b, method);
        # its two returned HTML strings fill the two result panels.
        submit_btn.click(
            fn=generate_diff,
            inputs=[text_a, text_b, method],
            outputs=[output_a, output_b],
        )
    description = (Path(__file__).parent / "description.md").read_text()
    gr.Markdown(description)


# Load the pipeline before launching the app; generate_diff would otherwise
# load it lazily on its first call.
my_pipeline = load_pipeline() if my_pipeline is None else my_pipeline
demo.launch()