<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href='static/style.xsl' type='text/xsl'?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-05-21T23:35:34Z</responseDate><request verb="GetRecord" identifier="oai:www.clarin.si:11356/1083" metadataPrefix="oai_dc">http://www.clarin.si/repository/oai/request</request><GetRecord><record><header><identifier>oai:www.clarin.si:11356/1083</identifier><datestamp>2023-03-27T17:01:18Z</datestamp><setSpec>hdl_11356_1023</setSpec><setSpec>hdl_11356_1024</setSpec></header><metadata><oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:doc="http://www.lyncode.com/xoai" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>CMC training corpus Janes-Norm 1.1</dc:title>
<dc:creator>Erjavec, Tomaž</dc:creator>
<dc:creator>Fišer, Darja</dc:creator>
<dc:creator>Čibej, Jaka</dc:creator>
<dc:creator>Arhar Holdt, Špela</dc:creator>
<dc:subject>computer-mediated communication</dc:subject>
<dc:subject>tokenisation</dc:subject>
<dc:subject>word normalisation</dc:subject>
<dc:subject>manual annotation</dc:subject>
<dc:subject>TEI</dc:subject>
<dc:description>Janes-Norm is a manually annotated corpus of Slovene Computer-Mediated Communication (CMC). It is meant as a gold-standard training and testing dataset for tokenisation, sentence segmentation and word normalisation of non-standard Slovene. The corpus is also automatically annotated with morphosyntactic descriptions and lemmas. As the corpus has been carefully manually annotated, it is also suitable for detailed linguistic explorations which require highly accurate and reliable annotations.&#xd;
&#xd;
The corpus is further described in:&#xd;
ERJAVEC, Tomaž, ČIBEJ, Jaka, ARHAR HOLDT, Špela, LJUBEŠIĆ, Nikola, FIŠER, Darja. Gold-standard datasets for annotation of Slovene computer-mediated communication. In Proceedings of RASLAN 2016: Recent Advances in Slavonic Natural Language Processing. Brno: Tribun EU, 2016, pp. 29-40, https://nlp.fi.muni.cz/raslan/raslan16.pdf&#xd;
&#xd;
Note that a related corpus, Janes-Tag, is also available, cf. http://hdl.handle.net/11356/1081.</dc:description>
<dc:date>2016-12-28</dc:date>
<dc:type>corpus</dc:type>
<dc:identifier>http://hdl.handle.net/11356/1083</dc:identifier>
<dc:language>slv</dc:language>
<dc:relation>http://hdl.handle.net/11356/1084</dc:relation>
<dc:rights>Creative Commons - Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)</dc:rights>
<dc:rights>https://creativecommons.org/licenses/by-sa/4.0/</dc:rights>
<dc:rights>PUB</dc:rights>
<dc:format>application/pdf</dc:format>
<dc:format>application/pdf</dc:format>
<dc:format>application/zip</dc:format>
<dc:format>application/zip</dc:format>
<dc:format>text/plain; charset=utf-8</dc:format>
<dc:format>downloadable_files_count: 4</dc:format>
<dc:publisher>Jožef Stefan Institute</dc:publisher>
<dc:source>https://nl.ijs.si/janes/</dc:source>
</oai_dc:dc>
</metadata></record></GetRecord></OAI-PMH>