<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href='static/style.xsl' type='text/xsl'?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-04-15T12:08:24Z</responseDate><request verb="GetRecord" identifier="oai:www.clarin.si:11356/1215" metadataPrefix="oai_dc">http://www.clarin.si/repository/oai/request</request><GetRecord><record><header><identifier>oai:www.clarin.si:11356/1215</identifier><datestamp>2023-03-27T17:01:19Z</datestamp><setSpec>hdl_11356_1023</setSpec><setSpec>hdl_11356_1024</setSpec></header><metadata><oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:doc="http://www.lyncode.com/xoai" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<dc:title>Keywords and n-grams from a textbook corpus</dc:title>
<dc:creator>Kosem, Iztok</dc:creator>
<dc:creator>Pori, Eva</dc:creator>
<dc:creator>Arhar Holdt, Špela</dc:creator>
<dc:subject>wordlist</dc:subject>
<dc:subject>n-grams</dc:subject>
<dc:subject>textbook corpus</dc:subject>
<dc:subject>keywords</dc:subject>
<dc:subject>vocabulary</dc:subject>
<dc:subject>school</dc:subject>
<dc:description>Wordlists, keywords and n-grams were extracted from a corpus of textbooks for Slovenian elementary and secondary schools. The corpus contains 4,302,857 words (5,373,268 tokens), and consists of 127 textbooks from 16 different subjects: &#xd;
- Biology (6 textbooks;  293,935 words),&#xd;
- State, society and ethics (1 textbook;  21,881 words),&#xd;
- Society (4 textbooks;  64,126 words),&#xd;
- Physics (5 textbooks;  185,171 words),&#xd;
- Geography (7 textbooks;  202,101 words),&#xd;
- Music (8 textbooks;  224,034 words),&#xd;
- Home Economics (3 textbooks;  33,803 words),&#xd;
- Chemistry (7 textbooks;  282,543 words),&#xd;
- Art (3 textbooks;  146,681 words),&#xd;
- Mathematics (23 textbooks;  764,012 words),&#xd;
- Science (5 textbooks;  226,191 words),&#xd;
- Science and technology (6 textbooks;  183,749 words),&#xd;
- Slovene language (37 textbooks;  1,437,945 words),&#xd;
- Environmental Education (7 textbooks;  38,645 words),&#xd;
- Technology (1 textbook;  24,733 words),&#xd;
- History (4 textbooks;  173,307 words). &#xd;
&#xd;
The lists were manually cleaned: most items not found in the reference morphological lexicon Sloleks (http://hdl.handle.net/11356/1039), which mainly consisted of conversion errors, were removed.&#xd;
&#xd;
The lists include only those words, keywords or n-grams that were found in at least 8 different subjects. Keyword lists were extracted using the Sketch Engine tool; the minimum frequency was set to 5, and the statistic used was average relative frequency. The minimum frequency for n-grams was 10.</dc:description>
<dc:date>2019-03-08</dc:date>
<dc:type>lexicalConceptualResource</dc:type>
<dc:identifier>http://hdl.handle.net/11356/1215</dc:identifier>
<dc:language>slv</dc:language>
<dc:rights>Creative Commons - Attribution 4.0 International (CC BY 4.0)</dc:rights>
<dc:rights>https://creativecommons.org/licenses/by/4.0/</dc:rights>
<dc:rights>PUB</dc:rights>
<dc:format>application/zip</dc:format>
<dc:format>text/plain; charset=utf-8</dc:format>
<dc:format>downloadable_files_count: 1</dc:format>
<dc:publisher>Centre for Language Resources and Technologies, University of Ljubljana</dc:publisher>
</oai_dc:dc>
</metadata></record></GetRecord></OAI-PMH>