<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href='static/style.xsl' type='text/xsl'?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-05-21T18:10:59Z</responseDate><request verb="GetRecord" identifier="oai:www.clarin.si:11356/2101" metadataPrefix="oai_dc">http://www.clarin.si/repository/oai/request</request><GetRecord><record><header><identifier>oai:www.clarin.si:11356/2101</identifier><datestamp>2026-05-14T15:32:07Z</datestamp><setSpec>hdl_11356_1023</setSpec><setSpec>hdl_11356_1479</setSpec></header><metadata><oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:doc="http://www.lyncode.com/xoai" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>Parallel sense-annotated corpus ELEXIS-WSD 2.0</dc:title>
<dc:creator>Čibej, Jaka</dc:creator>
<dc:creator>Krek, Simon</dc:creator>
<dc:creator>Tiberius, Carole</dc:creator>
<dc:creator>Martelli, Federico</dc:creator>
<dc:creator>Navigli, Roberto</dc:creator>
<dc:creator>Kallas, Jelena</dc:creator>
<dc:creator>Gantar, Polona</dc:creator>
<dc:creator>Koeva, Svetla</dc:creator>
<dc:creator>Nimb, Sanni</dc:creator>
<dc:creator>Sandford Pedersen, Bolette</dc:creator>
<dc:creator>Olsen, Sussi</dc:creator>
<dc:creator>Langemets, Margit</dc:creator>
<dc:creator>Koppel, Kristina</dc:creator>
<dc:creator>Üksik, Tiiu</dc:creator>
<dc:creator>Dobrovoljc, Kaja</dc:creator>
<dc:creator>Ureña-Ruiz, Rafael</dc:creator>
<dc:creator>Sancho-Sánchez, José-Luis</dc:creator>
<dc:creator>Lipp, Veronika</dc:creator>
<dc:creator>Váradi, Tamás</dc:creator>
<dc:creator>Győrffy, András</dc:creator>
<dc:creator>Simon, László</dc:creator>
<dc:creator>Quochi, Valeria</dc:creator>
<dc:creator>Monachini, Monica</dc:creator>
<dc:creator>Frontini, Francesca</dc:creator>
<dc:creator>Tempelaars, Rob</dc:creator>
<dc:creator>Costa, Rute</dc:creator>
<dc:creator>Salgado, Ana</dc:creator>
<dc:creator>Munda, Tina</dc:creator>
<dc:creator>Kosem, Iztok</dc:creator>
<dc:creator>Roblek, Rebeka</dc:creator>
<dc:creator>Kamenšek, Urška</dc:creator>
<dc:creator>Zaranšek, Petra</dc:creator>
<dc:creator>Zgaga, Karolina</dc:creator>
<dc:creator>Ponikvar, Primož</dc:creator>
<dc:creator>Terčon, Luka</dc:creator>
<dc:creator>Jensen, Jonas</dc:creator>
<dc:creator>Flörke, Ida</dc:creator>
<dc:creator>Lorentzen, Henrik</dc:creator>
<dc:creator>Troelsgård, Thomas</dc:creator>
<dc:creator>Blagoeva, Diana</dc:creator>
<dc:creator>Hristov, Dimitar</dc:creator>
<dc:creator>Kolkovska, Sia</dc:creator>
<dc:creator>Muischnek, Kadri</dc:creator>
<dc:creator>Saul, Kertu</dc:creator>
<dc:creator>Jõgi, Karoliina</dc:creator>
<dc:creator>Bon, Mija</dc:creator>
<dc:creator>Stanković, Ranka</dc:creator>
<dc:creator>Krstev, Cvetana</dc:creator>
<dc:creator>Marković, Aleksandra</dc:creator>
<dc:creator>Ikonić Nešić, Milica</dc:creator>
<dc:creator>Giouli, Voula</dc:creator>
<dc:creator>Papanikolaou, Eri</dc:creator>
<dc:creator>Lobzhanidze, Irina</dc:creator>
<dc:creator>Barbu Mititelu, Verginica</dc:creator>
<dc:creator>Popa, Simina</dc:creator>
<dc:creator>Cristiana, Lea</dc:creator>
<dc:creator>Catalin, Mihaila</dc:creator>
<dc:creator>Irimia, Elena</dc:creator>
<dc:creator>Ostroški Anić, Ana</dc:creator>
<dc:creator>Runjaić, Siniša</dc:creator>
<dc:creator>Sviben, Robert</dc:creator>
<dc:creator>Pavić, Martina</dc:creator>
<dc:creator>Filipović Petrović, Ivana</dc:creator>
<dc:creator>Alberski, Bartłomiej</dc:creator>
<dc:creator>Cvetkoski, Vladimir</dc:creator>
<dc:creator>Kanishcheva, Olha</dc:creator>
<dc:creator>Makhachashvili, Rusudan</dc:creator>
<dc:subject>word sense disambiguation</dc:subject>
<dc:subject>parallel corpus</dc:subject>
<dc:subject>sense annotation</dc:subject>
<dc:subject>multilingual</dc:subject>
<dc:description>ELEXIS-WSD is a parallel sense-annotated corpus in which content words (nouns, adjectives, verbs, and adverbs) have been assigned senses. Version 2.0 contains subcorpora with sentences for 17 languages: Bulgarian, Danish, English, Spanish, Estonian, Hungarian, Italian, Dutch, Portuguese, Slovene, Serbian, Croatian, Macedonian, Greek, Romanian, Georgian, and Polish. In addition, it contains manually corrected translations for Ukrainian - these will be processed in future versions. In 2.0, not all corpora cover all annotation layers - a more detailed overview is available in 00README.txt.&#xd;
&#xd;
The corpus was compiled by automatically extracting a set of sentences from WikiMatrix (Schwenk et al., 2019), a large open-access collection of parallel sentences derived from Wikipedia, using an automatic approach based on multilingual sentence embeddings. The sentences were manually validated according to specific formal, lexical and semantic criteria (e.g. by removing incorrect punctuation, morphological errors, notes in square brackets and etymological information typically provided in Wikipedia pages). To obtain satisfactory semantic coverage, sentences with fewer than 5 words and fewer than 2 polysemous words were filtered out. Subsequently, in order to obtain datasets in the other nine target languages, for each selected sentence in English, the corresponding WikiMatrix translation into each of the other languages was retrieved. If no translation was available, the English sentence was translated manually. The resulting corpus is comprised of 2,024 sentences for each language.&#xd;
&#xd;
The sentences were tokenized, lemmatized, and tagged with UPOS tags using UDPipe (https://lindat.mff.cuni.cz/services/udpipe/ - see 00README.txt for information on specific models). Senses were annotated using LexTag (https://elexis.babelscape.com/): each content word (noun, verb, adjective, and adverb) was assigned a sense from among the available senses from the sense inventory selected for the language (see below) or BabelNet. Sense inventories were also updated with new senses during annotation. Dependency relations were added with UDPipe 2.15 in version 1.2 and manually validated for Slovene, Georgian, Romanian, and Estonian.&#xd;
&#xd;
List of sense inventories&#xd;
BG: Dictionary of Bulgarian&#xd;
DA: DanNet – The Danish WordNet&#xd;
EN: Open English WordNet&#xd;
ES: Spanish Wiktionary&#xd;
ET: The EKI Combined Dictionary of Estonian&#xd;
HU: The Explanatory Dictionary of the Hungarian Language&#xd;
IT: PSC + Italian WordNet&#xd;
NL: Open Dutch WordNet&#xd;
PT: Portuguese Academy Dictionary (DACL)&#xd;
SL: Digital Dictionary Database of Slovene&#xd;
SR: Serbian WordNet&#xd;
&#xd;
The corpus is available in the CoNLL-U tab-separated format. In order, the columns contain the token ID, its form, its lemma, its UPOS-tag, its XPOS-tag (if available), its morphological features (FEATS), the head of the dependency relation (HEAD), the type of dependency relation (DEPREL); the ninth column (DEPS) is empty; the final MISC column contains the following: the token's whitespace information (whether the token is followed by a whitespace or not; e.g. SpaceAfter=No), the ID of the sense assigned to the token, the index of the multiword expression (if the token is part of an annotated multiword expression), and the index and type of the named entity annotation (currently only available in elexis-wsd-sl and elexis-wsd-en).&#xd;
&#xd;
Each language has a separate sense inventory containing all the senses (and their definitions) used for annotation in the corpus. Not all the senses from the sense inventory are necessarily included in the corpus annotations: for instance, all occurrences of the English noun "bank" in the corpus might be annotated with the sense of "financial institution", but the sense inventory also contains the sense "edge of a river" as well as all other possible senses to disambiguate between.&#xd;
&#xd;
For more information, please refer to 00README.txt.&#xd;
&#xd;
Updates in version 2.0:&#xd;
- Subcorpora for 7 new languages (Serbian, Croatian, Macedonian, Greek, Romanian, Polish, Georgian) and translations for Ukrainian were added.&#xd;
- Sense annotations for ELEXIS-WSD-sl were updated. Additional multiword expression annotations were added according to the PARSEME 2.0 guidelines (see 00README.txt).</dc:description>
<dc:date>2026-04-01</dc:date>
<dc:type>corpus</dc:type>
<dc:identifier>http://hdl.handle.net/11356/2101</dc:identifier>
<dc:language>slv</dc:language>
<dc:language>eng</dc:language>
<dc:language>bul</dc:language>
<dc:language>dan</dc:language>
<dc:language>por</dc:language>
<dc:language>ita</dc:language>
<dc:language>spa</dc:language>
<dc:language>hun</dc:language>
<dc:language>est</dc:language>
<dc:language>nld</dc:language>
<dc:language>srp</dc:language>
<dc:language>hrv</dc:language>
<dc:language>mkd</dc:language>
<dc:language>ell</dc:language>
<dc:language>ukr</dc:language>
<dc:language>pol</dc:language>
<dc:language>kat</dc:language>
<dc:language>ron</dc:language>
<dc:relation>info:eu-repo/grantAgreement/EC/H2020/731015</dc:relation>
<dc:relation>https://elex.link/elex2021/wp-content/uploads/2021/08/eLex_2021_22_pp377-395.pdf</dc:relation>
<dc:relation>http://hdl.handle.net/11356/2029</dc:relation>
<dc:rights>Creative Commons - Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)</dc:rights>
<dc:rights>https://creativecommons.org/licenses/by-sa/4.0/</dc:rights>
<dc:rights>PUB</dc:rights>
<dc:format>text/plain; charset=utf-8</dc:format>
<dc:format>application/zip</dc:format>
<dc:format>downloadable_files_count: 1</dc:format>
<dc:publisher>Jožef Stefan Institute</dc:publisher>
<dc:source>https://elex.is/</dc:source>
</oai_dc:dc>
</metadata></record></GetRecord></OAI-PMH>