File size: 20,926 Bytes
7005a40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Multilingual Summarizer\n",
    "- Feel free to play with models\n",
    "- Gpus recommended due to faster summarization\n",
    "- Firstly, include necessary .py files or clone git repo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "import torch as pt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from collections import OrderedDict\n",
    "\n",
    "from transformers import AutoModelForSeq2SeqLM\n",
    "from transformers import  AutoTokenizer\n",
    "import datasets\n",
    "\n",
    "import logging\n",
    "logging.basicConfig(level=logging.INFO, format='%(asctime)s | %(name)s | %(levelname)s | %(message)s')\n",
    "\n",
    "# These files need to be included\n",
    "from summarization.tokenizer import DatasetTokenizer\n",
    "from summarization.summarizer import Summarizer\n",
    "from summarization.summarization_metrics import MetricsComputation\n",
    "\n",
    "\n",
    "\n",
    "## READ THIS (To run):\n",
    "# clone this git repo: https://gitlab.fel.cvut.cz/factchecking/experimental-marian-krotil\n",
    "# insert this .ipynb file into the -/tree/main/projects/python/git_krotima1/source directory\n",
    "# --> due to the included files / or copy them directly to this notebook\n",
    "\n",
    "class MultiSummarizer:\n",
    "    \n",
    "    ## Constructor\n",
    "    # input:    model_name : string : Huggingface checkpoint (ctu-aic/m2m100-418M-multilingual-summarization-multilarge-cs, ctu-aic/mt5-base-multilingual-summarization-multilarge-cs, ctu-aic/mbart25-multilingual-summarization-multilarge-cs)\n",
    "    #           language : string : cs, en, de, fr, es, tr, ru, zh\n",
    "    #           inference_cfg : dict : parameters of generation method\n",
    "    #\n",
    "    # \n",
    "    #\n",
    "    def __init__(self, model_name, language, inference_cfg=None, **kwargs):\n",
    "        logging.info(f\"Initializing multilingual summarizer {model_name}\")\n",
    "        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)\n",
    "        self.dstTokenizer = DatasetTokenizer(model_name.split('/')[-1], model_name, language)\n",
    "        self.tokenizer  = self.dstTokenizer.get_tokenizer()\n",
    "        self.langid = self.dstTokenizer.get_langid()\n",
    "        self.inference_cfg = inference_cfg\n",
    "        self.enc_max_len = 512\n",
    "        self.language = language\n",
    "        #cuda/cpu device\n",
    "        if pt.cuda.is_available():\n",
    "            self.model.cuda()\n",
    "\n",
    "\n",
    "    ## Function __call__\n",
    "    # input:    texts - (list of strings, string, dataset) - texts in selected language to summarize\n",
    "    #           golds - (None, list of strings, string) - target summary - if provided the ROUGE scores are computed\n",
    "    #           inference_cfg - dictionary with configuration of generation method\n",
    "    #           text_column - if texts is in the dataset type, it is column which will be summarized\n",
    "    #   \n",
    "    # output:   tuple (list of summaries, - {} empty dic if no golds)\n",
    "    #                                     - dict of ROUGE scores if golds are given)\n",
    "    # functionality:\n",
    "    # - converts input to the Huggingface datasets if not provided\n",
    "    # - tokenize & summarize input texts\n",
    "    # - compute scores based on passed arguments\n",
    "    # \n",
    "    def __call__(self, texts, golds=None, inference_cfg=None, text_column=\"\", **kwargs):\n",
    "        \n",
    "        #check input\n",
    "        golds = [golds] if type(golds) == str else golds\n",
    "        assert golds is None or type(golds) == list and type(golds[0]) == str, \"Golds: Expected type: None, string or list of strings\"\n",
    "        \n",
    "        if type(texts) != datasets.Dataset:\n",
    "            texts = [texts] if type(texts) == str else texts\n",
    "            assert type(texts) == list and type(texts[0]) == str, \"Texts: Expected type: dataset, string or list of strings\"\n",
    "            \n",
    "        \n",
    "        self.inference_cfg = inference_cfg if inference_cfg is not None else self.inference_cfg\n",
    "        logging.info(f\"Summarizing data with the generation config: {self.inference_cfg}\")\n",
    "        \n",
    "        #get hgft dataset\n",
    "        dst = datasets.DatasetDict()\n",
    "        if type(texts) != datasets.Dataset:\n",
    "            df = pd.DataFrame({'text': texts})\n",
    "            dst[\"test\"] = datasets.Dataset.from_pandas(df)\n",
    "        else:\n",
    "            dst[\"test\"] = texts\n",
    "            \n",
    "            \n",
    "        #Tokenize input texts\n",
    "        text_column = 'text' if text_column == \"\" else text_column\n",
    "        cfg = {\"text_column\": text_column}\n",
    "        \n",
    "        tok_dst = self.dstTokenizer.tokenize(dst, encoder_input_ids=self.enc_max_len, decoder_input_ids=None,**cfg)[\"test\"]\n",
    "        \n",
    "        #Init Summarizer\n",
    "        summarizer = Summarizer(model = self.model, tokenizer = self.tokenizer,lcode=self.langid, batch_size = 8)\n",
    "        \n",
    "        #Summarize texts\n",
    "        summarizer.summarize_dst(tok_dst,**self.inference_cfg)\n",
    "    \n",
    "        \n",
    "        scores = {}\n",
    "        if golds is not None:\n",
    "            #compute scores if gold texts are provided\n",
    "            metrics = MetricsComputation(self.language)\n",
    "            scores = metrics.compute_scores(gold = golds, summary=summarizer.summarized_dst['summary'])\n",
    "            \n",
    "        \n",
    "        return (summarizer.summarized_dst['summary'], scores)\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "## Configuration of summarization pipeline\n",
    "#\n",
    "def summ_config():\n",
    "    cfg = OrderedDict([\n",
    "        \n",
    "        ## summarization model - checkpoint\n",
    "        #   ctu-aic/m2m100-418M-multilingual-summarization-multilarge-cs\n",
    "        #   ctu-aic/mt5-base-multilingual-summarization-multilarge-cs\n",
    "        #   ctu-aic/mbart25-multilingual-summarization-multilarge-cs\n",
    "        (\"model_name\", \"ctu-aic/m2m100-418M-multilingual-summarization-multilarge-cs\"),\n",
    "        \n",
    "        ## language of summarization task\n",
    "        #   language : string : cs, en, de, fr, es, tr, ru, zh\n",
    "        (\"language\", \"en\"), \n",
    "        \n",
    "        ## generation method parameters in dictionary\n",
    "        #\n",
    "        (\"inference_cfg\", OrderedDict([\n",
    "            (\"num_beams\", 4),\n",
    "            (\"top_k\", 40),\n",
    "            (\"top_p\", 0.92),\n",
    "            (\"do_sample\", True),\n",
    "            (\"temperature\", 0.89),\n",
    "            (\"repetition_penalty\", 1.2),\n",
    "            (\"no_repeat_ngram_size\", None),\n",
    "            (\"early_stopping\", True),\n",
    "            (\"max_length\", 128),\n",
    "            (\"min_length\", 10),\n",
    "        ])),\n",
    "        #texts to summarize values = (list of strings, string, dataset)\n",
    "        (\"texts\",\n",
    "            [\n",
    "                'Havana, Cuba (CNN)All eyes are going to be on the new kid finally allowed to play and the big kid who for so long wanted nothing to do with him -- Cuba and the United States in the same diplomatic playground. Cuba pulled off a diplomatic coup by marshaling the support of other regional countries to insist on their attendance at the Summit of the Americas. And for the first time since 1962, the U.S. has not blocked Cuba\\'s attempt to join. Now it\\'s time to see how they play and who they play with -- especially Venezuela, which often falls out with Washington for crushing dissent at home and supplying Havana with billions of dollars in oil. Cuba is trying to re-establish itself at the two-day summit in Panama, arriving with more than 100 government officials, diplomats, small business people and artists. But Cuba\\'s attempts to rebrand itself as an open, diverse society stumbled Wednesday when government supporters and anti-Castro supporters brawled in the streets of Panama. Video of the incident showed Cuban government officials exchanging punches and insults with dissidents until Panamanian police in riot gear broke up the melee. With the historic thawing in relations between the U.S. and Cuba, Washington now has urgent business to discuss with Havana. \"We have really big issues with the Cubans that do need to be solved,\" said Ambassador Vicki Huddleston, who served as the chief of the U.S. Interests Section in Havana. She added \"The Cubans are typical of their negotiating style. You think it\\'s going to be easy because we have said \\'We are going to have good relations with you\\' and they say, \\'That\\'s not exciting for us and it is for you.\\' So they are hard negotiators as they always have been.\" The forum could provide the opportunity to push forward an agreement to re-establish formal relations and re-open embassies after nearly four months of negotiations. While President Barack Obama is not scheduled to meet Cuban leader Raul Castro, U.S. officials said there will be opportunities for \"interaction\" between the two leaders. The first time the two heads of state met was in 2013 at Nelson Mandela\\'s funeral. Their brief handshake captured the world\\'s attention and lit up social media. Few people then knew that the two countries were secretly involved in negotiations to thaw five decades of deadlocked Cold War-era relations. Obama had said he had hoped a U.S. Embassy would reopen in Havana before the summit, but Cuban officials have said they cannot imagine a full restoration of diplomatic ties until Cuba is removed from the U.S. State Department list of countries that support terrorism. \"It would be difficult to explain that diplomatic relations have been resumed while Cuba has been unjustly listed as a state sponsor of international terrorism,\" said Josefina Vidal, the general director of U.S. affairs at the Cuban Foreign Ministry and lead negotiator in the talks. Cuba was added to the list in 1982, which includes Syria, Iran and Sudan. The designation carries financial sanctions which Cuban officials say further damages their already ailing economy. The State Department has sent a recommendation to the White House that Cuba be removed, paving the way for the White House to announce its intent to de-list Cuba as early as this week, two administration officials told CNN. Removal from the list \"does not relate to whether or not we agree with everything a country does or whether we agree with its political system, or its foreign policy,\" Deputy National Security Adviser Ben Rhodes said on a conference call with reporters Tuesday. \"It\\'s a very practical review as to whether or not a government is sponsoring terrorism.\" Rhodes also dialed backed rhetoric on Venezuela, saying the country did not pose a national security threat to the United States, despite a recent declaration to that effect. The designation was meant to allow officials to target seven allegedly corrupt Venezuelan officials, but it ignited a firestorm, particularly in Cuba, which has close ties to Venezuela. Deceased Venezuelan President Hugo Chavez was a friend and admirer of former Cuban leader Fidel Castro. Chavez\\'s successor Nicolas Maduro continues to send Cuba tens of thousands of barrels of oil each day, despite his country\\'s own economic turmoil. In exchange, Cuba sends doctors, military advisers and sports trainers to Venezuela. In Cuba\\'s state-run media, criticism of U.S. policy towards Venezuela has overshadowed the improvement in U.S.-Cuba relations. In March, Fidel Castro published a letter criticizing the U.S.\\' \"brutal plans towards\" Venezuela and the Cuban government promised \"unconditional aid\" to help defend against American threats. Its remains to be seen how much Cuba will risk its warming relations with the United States to back up ally Venezuela. But apparently there is little doubt among the Cuban people on what their government should do. A poll of 1,200 Cubans released on Wednesday found that 97% of the people surveyed by Miami-based polling firm Bendixen & Amandi on behalf of The Washington Post and Univision Noticias/Fusion supported improved U.S.-Cuban relations.'\n",
    "                ,\n",
    "                '(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed \"in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014.\" Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\\'s ceremony, said it was a move toward greater justice. \"As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice,\" he said, according to an ICC news release. \"Indeed, today brings us closer to our shared goals of justice and peace.\" Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. \"As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly,\" she said. Rights group Human Rights Watch welcomed the development. \"Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\\'s treaty should speak out to welcome its membership,\" said Balkees Jarrah, international justice counsel for the group. \"What\\'s objectionable is the attempts to undermine international justice, not Palestine\\'s decision to join a treaty to which over 100 countries around the world are members.\" In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it \"strongly\" disagreed with the court\\'s decision. \"As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC,\" the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. \"We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace,\" it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as \"Palestine.\" While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would \"conduct its analysis in full independence and impartiality.\" The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.'\n",
    "                #\"Tady je deset dní držel! První pohled do šíleného úkrytu unesených dětí. Interiér vůbec neodpovídá běžným garážím – stěny jsou obložené izolací, na zemi dlažba a koberec, dvě matrace. „Byl tam nedodělanej sprchovej kout, měl tam vařák na jídlo, stůl, dvě nový matrace na sobě, deky, polštář, ledničku a záchod,“ popsal serveru iDnes.cz Daniel. Garáž měla troje dveře, jedny na klíč, jedny na číselný kód. Protihluková izolace, kterou byl objekt doslova obalený, byla patrná zejména na střeše a zadní stěně zvenku. Zbývá si položit jednu otázku – jak dlouho únosce celou věc plánoval?Už večer před nalezením dvou unesených dětí byla policie v areálu garáží, kde byli vězněni šestnáctiletý chlapec a o tři roky mladší dívka z Litoměřicka. „Nic nenasvědčovalo tomu, že by se tam nacházeli,“ řekl novinářům náměstek ředitele krajské policie Zbyněk Dvořák. Děti nalezli v garáži na ústeckém Střekově minulou středu dopoledne rodiče. Z únosu dětí je obviněn šestatřicetiletý recidivista Zdeněk H. Děti držel od 28. srpna do 7. září. Policie sestavila speciální tým, který bude prověřovat mimo jiné i to, zda se nedopustil dalších skutků.Policie podle Dvořáka při pátrání prověřovala desítky oznámení, podle kterých měly být děti viděny v různých částech Ústeckého kraje, Česka i v zahraničí. Klíčovou roli sehrály v případu dopisy, které dostaly rodiny 6. září. Kriminalisté se primárně zabývali Doksy a okolí, odkud byl dopis odeslaný. „Navíc chlapec měl vztah k Doksům, byli tam na dovolené. Některé indicie šly tam, že je pravděpodobné, že jsou v té lokalitě. I po téhle stopě se 6. a zejména 7. září ráno šlo,“ vysvětlila vedoucí litoměřické policie Helena Pšeničková.Obsah dopisů zkoumali rodiny i písmoznalci. Řada slov byla opakovaně přeškrtaná a přepsaná. „Slova dávala význam střecha, střela, Střekov, současně probíhalo pátrání na Střeleckém ostrově. Dochází k tomu, že večer dostáváme informaci od rodiny, že relevantní má být opravdu Střekov. Kolem desáté hodiny večer probíhá prověrka na místě v areálu těch garáží. Děti nebyly nalezeny. Nic nenasvědčuje tomu, že se tam v tu dobu nacházejí,“ řekl Dvořák.Druhý den ráno se ke garážím vypravili rodiče chlapce. „Informují nás o tom, že z jedné z těch garáží je slyšet hudba,“ doplnil Dvořák. Po prověření zvukových signálů policie do garáže vnikla.Muži hrozí podle stávající právní kvalifikace až osm let vězení. Čelí obvinění z vydírání, zbavení osobní svobody, ohrožování výchovy mládeže a šíření pornografie.Kriminalisté potvrdili, že podezřelým je majitel garáže na Střekově. Je jím šestatřicetiletý recidivista, který byl už před lety odsouzen za pokusy o únos malých dívek na Českolipsku. Soud ho v sobotu poslal do vazby.\",\n",
    "            ]\n",
    "        ),\n",
    "        #Target summaries values = (list of strings, string, None)\n",
    "        ('golds',\n",
    "         [\n",
    "             \"Cuba pulled off a diplomatic coup by gaining attendance at Summit of the Americas. First time since 1962, the U.S. has not blocked Cuba's attempt to join. Cuba is trying to re-establish itself at the two-day summit in Panama.\",\n",
    "             'Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June. Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis.'\n",
    "         ]),\n",
    "    ])\n",
    "    return cfg\n",
    "\n",
    "cfg = summ_config()\n",
    "msummarizer = MultiSummarizer(**cfg)\n",
    "ret = msummarizer(**cfg)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "ret = msummarizer(**cfg)\n",
    "print(ret)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}