pminervini committed on
Commit
46bcca0
·
1 Parent(s): 05346b7
src/backend/tasks/halueval/halueval_dialogue.yaml CHANGED
@@ -4,13 +4,19 @@ task: halueval_dialogue
4
  dataset_path: pminervini/HaluEval
5
  dataset_name: dialogue_samples
6
  output_type: generate_until
7
- training_split: data
8
- validation_split: data
9
  test_split: data
10
  num_fewshot: 0
11
  doc_to_text: !function utils.doc_to_text_dialogue
12
  doc_to_target: !function utils.doc_to_target
13
  process_results: !function utils.process_results
 
 
 
 
 
 
14
  metric_list:
15
  - metric: em
16
  aggregation: mean
 
4
  dataset_path: pminervini/HaluEval
5
  dataset_name: dialogue_samples
6
  output_type: generate_until
7
+ training_split: null
8
+ validation_split: null
9
  test_split: data
10
  num_fewshot: 0
11
  doc_to_text: !function utils.doc_to_text_dialogue
12
  doc_to_target: !function utils.doc_to_target
13
  process_results: !function utils.process_results
14
+ generation_kwargs:
15
+ until:
16
+ - "\n"
17
+ - "."
18
+ do_sample: false
19
+ temperature: 0.0
20
  metric_list:
21
  - metric: em
22
  aggregation: mean
src/backend/tasks/halueval/halueval_qa.yaml CHANGED
@@ -4,13 +4,19 @@ task: halueval_qa
4
  dataset_path: pminervini/HaluEval
5
  dataset_name: qa_samples
6
  output_type: generate_until
7
- training_split: data
8
- validation_split: data
9
  test_split: data
10
  num_fewshot: 0
11
  doc_to_text: !function utils.doc_to_text_qa
12
  doc_to_target: !function utils.doc_to_target
13
  process_results: !function utils.process_results
 
 
 
 
 
 
14
  metric_list:
15
  - metric: em
16
  aggregation: mean
 
4
  dataset_path: pminervini/HaluEval
5
  dataset_name: qa_samples
6
  output_type: generate_until
7
+ training_split: null
8
+ validation_split: null
9
  test_split: data
10
  num_fewshot: 0
11
  doc_to_text: !function utils.doc_to_text_qa
12
  doc_to_target: !function utils.doc_to_target
13
  process_results: !function utils.process_results
14
+ generation_kwargs:
15
+ until:
16
+ - "\n"
17
+ - "."
18
+ do_sample: false
19
+ temperature: 0.0
20
  metric_list:
21
  - metric: em
22
  aggregation: mean
src/backend/tasks/halueval/halueval_summarization.yaml CHANGED
@@ -4,13 +4,19 @@ task: halueval_summarization
4
  dataset_path: pminervini/HaluEval
5
  dataset_name: summarization_samples
6
  output_type: generate_until
7
- training_split: data
8
- validation_split: data
9
  test_split: data
10
  num_fewshot: 0
11
  doc_to_text: !function utils.doc_to_text_summarization
12
  doc_to_target: !function utils.doc_to_target
13
  process_results: !function utils.process_results
 
 
 
 
 
 
14
  metric_list:
15
  - metric: em
16
  aggregation: mean
 
4
  dataset_path: pminervini/HaluEval
5
  dataset_name: summarization_samples
6
  output_type: generate_until
7
+ training_split: null
8
+ validation_split: null
9
  test_split: data
10
  num_fewshot: 0
11
  doc_to_text: !function utils.doc_to_text_summarization
12
  doc_to_target: !function utils.doc_to_target
13
  process_results: !function utils.process_results
14
+ generation_kwargs:
15
+ until:
16
+ - "\n"
17
+ - "."
18
+ do_sample: false
19
+ temperature: 0.0
20
  metric_list:
21
  - metric: em
22
  aggregation: mean
src/backend/tasks/xsum/utils.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sacrebleu
2
+ import numpy as np
3
+
4
+ from rouge_score import rouge_scorer, scoring
5
+
6
+
7
def process_results(doc, results):
    """
    Score one generated summary against the document's reference summary.

    :param doc:
        A dataset record; only its `"summary"` field is read here.
    :param results:
        The model outputs for this record; the first element is taken as
        the generated summary.
    :return:
        A `dict` with the keys `"rouge1"`, `"rouge2"` and `"rougeL"`, holding
        the values `rouge` reports for `rouge1`, `rouge2` and `rougeLsum`.
    """
    completion = results[0]
    reference = doc["summary"]

    # There is exactly one reference per record, so a single `rouge` call
    # over one (reference, completion) pair yields all three scores.
    scores = rouge([reference], [completion])

    return {
        "rouge1": scores["rouge1"],
        "rouge2": scores["rouge2"],
        # Reported as "rougeL" but computed from the `rougeLsum` variant.
        "rougeL": scores["rougeLsum"],
    }
39
+
40
+
41
def bleu(refs, preds):
    """
    Compute a `t5` style corpus BLEU score. Reference implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L41

    :param refs:
        A `list` of `list` of reference `str`s.
    :param preds:
        A `list` of predicted `str`s.
    :return:
        The `score` attribute of the `sacrebleu.corpus_bleu` result.
    """
    # Fixed scoring options, kept together so the call site stays short.
    bleu_options = dict(
        smooth_method="exp",
        smooth_value=0.0,
        force=False,
        lowercase=False,
        tokenize="intl",
        use_effective_order=False,
    )
    bleu_result = sacrebleu.corpus_bleu(preds, refs, **bleu_options)
    return bleu_result.score
62
+
63
+
64
def rouge(refs, preds):
    """
    Compute `t5` style ROUGE scores. Reference implementation:
    https://github.com/google-research/text-to-text-transfer-transformer/blob/3d10afd51ba97ac29eb66ae701eca274488202f7/t5/evaluation/metrics.py#L68

    :param refs:
        A `list` of reference `strs`.
    :param preds:
        A `list` of predicted `strs`.
    :return:
        A `dict` mapping `"rouge1"`, `"rouge2"` and `"rougeLsum"` to the
        aggregated `mid` F-measure multiplied by 100.
    """
    rouge_types = ["rouge1", "rouge2", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types)
    # Accumulates the per-pair scores (with confidence intervals).
    aggregator = scoring.BootstrapAggregator()

    for reference, prediction in zip(refs, preds):
        # Add newlines between sentences to correctly compute `rougeLsum`.
        reference = reference.replace(" . ", ".\n")
        prediction = prediction.replace(" . ", ".\n")
        aggregator.add_scores(scorer.score(reference, prediction))

    aggregated = aggregator.aggregate()
    scaled_scores = {}
    for rouge_type in rouge_types:
        scaled_scores[rouge_type] = aggregated[rouge_type].mid.fmeasure * 100
    return scaled_scores
src/backend/tasks/xsum/xsum.yaml ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# XSum summarisation task: the model continues "Document: ...\nSummary:" and
# the generation is scored by utils.process_results.
task: xsum
dataset_path: EdinburghNLP/xsum
dataset_name: xsum
output_type: generate_until
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Document: {{document}}\nSummary:"
doc_to_target: "{{summary}}"
# process_docs: !function utils.process_docs
process_results: !function utils.process_results
should_decontaminate: True
doc_to_decontamination_query: document
generation_kwargs:
  # Generation stops at the first newline or period.
  until:
    - "\n"
    - "."
  do_sample: false
  temperature: 0.0
# Metric names mirror the keys returned by utils.process_results
# (rouge1, rouge2, rougeL).
# NOTE(review): the harness is expected to look up aggregation by these
# returned keys - confirm against the lm-evaluation-harness version in use.
metric_list:
  - metric: rouge1
    aggregation: mean
    higher_is_better: true
  - metric: rouge2
    aggregation: mean
    higher_is_better: true
  - metric: rougeL
    aggregation: mean
    higher_is_better: true
metadata:
  - version: 0.0