<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href='static/style.xsl' type='text/xsl'?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-05-21T20:28:38Z</responseDate><request verb="GetRecord" identifier="oai:www.clarin.si:11356/1052" metadataPrefix="oai_dc">http://www.clarin.si/repository/oai/request</request><GetRecord><record><header><identifier>oai:www.clarin.si:11356/1052</identifier><datestamp>2025-03-18T11:05:36Z</datestamp><setSpec>hdl_11356_1023</setSpec><setSpec>hdl_11356_1024</setSpec></header><metadata><oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:doc="http://www.lyncode.com/xoai" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>Training corpus ssj500k 1.4</dc:title>
<dc:creator>Krek, Simon</dc:creator>
<dc:creator>Dobrovoljc, Kaja</dc:creator>
<dc:creator>Erjavec, Tomaž</dc:creator>
<dc:creator>Može, Sara</dc:creator>
<dc:creator>Ledinek, Nina</dc:creator>
<dc:creator>Holz, Nanika</dc:creator>
<dc:subject>tagging</dc:subject>
<dc:subject>dependency treebank</dc:subject>
<dc:subject>parsing</dc:subject>
<dc:subject>named entities</dc:subject>
<dc:subject>tokenisation</dc:subject>
<dc:subject>manual annotation</dc:subject>
<dc:subject>TEI</dc:subject>
<dc:description>The ssj500k training corpus contains 500,000 words, manually annotated on the levels of tokenization, sentence segmentation, morphosyntactic tagging, lemmatisation, named entities, and, partially, syntactic dependencies. The ssj500k corpus uses the MULTEXT-East / JOS morphosyntactic tagset and the JOS dependency schema and is based on the jos100k and jos1M corpora. Note that this entry updates ssj500k 1.3 by fixing many annotation errors.</dc:description>
<dc:date>2015-10-26</dc:date>
<dc:type>corpus</dc:type>
<dc:identifier>http://hdl.handle.net/11356/1052</dc:identifier>
<dc:language>slv</dc:language>
<dc:relation>http://hdl.handle.net/11356/1029</dc:relation>
<dc:relation>http://hdl.handle.net/11356/1165</dc:relation>
<dc:rights>Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)</dc:rights>
<dc:rights>https://creativecommons.org/licenses/by-nc-sa/4.0/</dc:rights>
<dc:rights>PUB</dc:rights>
<dc:format>application/zip</dc:format>
<dc:format>application/zip</dc:format>
<dc:format>application/zip</dc:format>
<dc:format>text/plain; charset=utf-8</dc:format>
<dc:format>downloadable_files_count: 3</dc:format>
<dc:publisher>Centre for Language Resources and Technologies, University of Ljubljana</dc:publisher>
<dc:source>http://eng.slovenscina.eu/ucni-korpus</dc:source>
</oai_dc:dc>
</metadata></record></GetRecord></OAI-PMH>