UK | FFUK | webmaster

intercorp - logo
ic_bibl.bib

ic_bibl.bib

% LaTeX helpers emitted ahead of the bibliography: \nic discards its argument,
% \urls sets its argument in \small, and \frenchspacing is switched on.
% \providecommand is used so that including the bibliography more than once
% (or a document-level definition of the same names) does not raise an error.
@preamble{{\providecommand{\nic}[1]{}\frenchspacing\providecommand{\urls}[1]{{\small #1}}}}
% Journal article introducing the InterCorp parallel corpus.
@article{cermak:rosen:10,
  author = {Franti{\v s}ek {\v C}erm{\'a}k and Alexandr Rosen},
  date-modified = {2013-04-08 07:47:27 +0000},
  issn = {1384-6655},
  journal = {International Journal of Corpus Linguistics},
  keywords = {parallel corpora, comparative corpus linguistics, European languages, Czech, multilingualism},
  number = {3},
  pages = {411--427},
  title = {The Case of {InterCorp}, a multilingual parallel corpus},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2012_intercorp_ijcl.pdf},
  volume = {17},
  year = {2012},
  abstract = {This paper introduces InterCorp, a parallel corpus including texts in Czech and 27 other languages, available for online searches via a web interface. After discussing some issues and merits of a multilingual resource we argue that it has an important role especially for languages with fewer native speakers, supporting both comparative research and studies of the language from the perspective of other languages. We proceed with an overview of the corpus --- the strategy and criteria for including new texts, the representation of available languages and text types, linguistic annotation, and a sketch of pre-processing issues. Finally, we present the search interface and suggest some research opportunities.},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2012_intercorp_ijcl.pdf}
}
% LREC 2012 conference paper on the architecture and state of InterCorp.
% The conference days are kept in the ISO-formatted eventdate field; month is
% given as the standard macro so that styles can localise it.
@inproceedings{Rosen:Vavrin:2012,
  address = {Istanbul, Turkey},
  author = {Alexandr Rosen and Martin Vav{\v r}{\'\i}n},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)},
  eventdate = {2012-05-23/2012-05-25},
  date-added = {2012-05-28 22:44:16 +0000},
  date-modified = {2013-04-28 20:44:21 +0000},
  editor = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Mehmet U{\u g}ur Do{\u g}an and Bente Maegaard and Joseph Mariani and Jan Odijk and Stelios Piperidis},
  isbn = {978-2-9517408-7-7},
  keywords = {parallel corpora; multilingual; Czech},
  language = {english},
  month = may,
  pages = {2447--2452},
  publisher = {European Language Resources Association (ELRA)},
  sponsor = {MSM0021620823},
  title = {Building a multilingual parallel corpus for human users},
  url = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/200.html},
  year = {2012},
  abstract = {We present the architecture and the current state of InterCorp, a multilingual parallel corpus centered around Czech, intended primarily for human users and consisting of written texts with a focus on fiction. Following an outline of its recent development and a comparison with some other multilingual parallel corpora we give an overview of the data collection procedure that covers text selection criteria, data format, conversion, alignment, lemmatization and tagging. Finally, we discuss challenges and prospects of the project.},
  bdsk-url-1 = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/200.html}
}
% Chapter in "Korpusova lingvistika Praha 2011: 1 -- InterCorp".
% series/volume mirror the other entries in this file for the same book.
@incollection{Jirasek:2011,
  address = {Praha},
  author = {Karel Jir{\'a}sek},
  booktitle = {Korpusov{\'a} lingvistika Praha 2011: 1 -- InterCorp},
  date-added = {2012-11-02 23:01:27 +0000},
  date-modified = {2013-04-28 20:44:45 +0000},
  editor = {Franti{\v s}ek {\v C}erm{\'a}k},
  keywords = {parallel corpora, Croatian, lexicon},
  pages = {45--55},
  publisher = {Nakladatelstv{\'\i} Lidov{\'e} noviny},
  series = {Studie z korpusov{\'e} lingvistiky},
  title = {Vyu{\v z}it{\'\i} paraleln{\'\i}ho korpusu {InterCorp} k z{\'\i}sk{\'a}v{\'a}n{\'\i} ekvivalent{\r u} pro chorvatsko-{\v c}esk{\'y} slovn{\'\i}k},
  volume = {14},
  year = {2011}
}
% Chapter in "Korpusova lingvistika Praha 2011: 1 -- InterCorp" (series volume 14).
@incollection{Kana:2011,
  author        = {Tom{\'a}{\v s} Ka{\v n}a},
  title         = {Deminutiva a deminutivn{\'\i} vyj{\'a}d{\v r}en{\'\i} v {\v c}e{\v s}tin{\v e}, n{\v e}m{\v c}in{\v e} a angli{\v c}tin{\v e} -- hled{\'a}n{\'\i} hranic},
  booktitle     = {Korpusov{\'a} lingvistika Praha 2011: 1 -- InterCorp},
  editor        = {Franti{\v s}ek {\v C}erm{\'a}k},
  series        = {Studie z korpusov{\'e} lingvistiky},
  volume        = {14},
  pages         = {168--185},
  publisher     = {Nakladatelstv{\'\i} Lidov{\'e} noviny},
  organization  = {{\'U}stav {\v C}esk{\'e}ho n{\'a}rodn{\'\i}ho korpusu},
  address       = {Praha},
  year          = {2011},
  date-added    = {2012-07-02 15:22:24 +0000},
  date-modified = {2012-07-02 15:24:12 +0000},
}
% Paper in part 2 of "Korpusova lingvistika Praha 2011" (series volume 15);
% the title carries its English translation in square brackets.
@inproceedings{Kren:etal:2011,
  address = {Praha},
  author = {Michal K{\v r}en and Alexandr Rosen and Michal {\v S}toura{\v c} and Martin Vav{\v r}{\'\i}n and Pavel Vond{\v r}i{\v c}ka},
  booktitle = {Korpusov{\'a} lingvistika Praha 2011: 2 -- V{\'y}zkum a v{\'y}stavba korpus{\r u}},
  date-added = {2011-10-09 16:11:10 +0200},
  date-modified = {2013-04-23 21:44:11 +0000},
  editor = {Franti{\v s}ek {\v C}erm{\'a}k},
  organization = {{\'U}stav {\v C}esk{\'e}ho n{\'a}rodn{\'\i}ho korpusu},
  pages = {105--115},
  series = {Studie z korpusov{\'e} lingvistiky},
  title = {Paraleln{\'\i} korpus {InterCorp} po sedmi letech [{The} parallel corpus {InterCorp} after seven years]},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2011_intercorp_korpling.pdf},
  volume = {15},
  volumetitle = {2 -- V{\'y}zkum a v{\'y}stavba korpus{\r u}},
  year = {2011},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2011_intercorp_korpling.pdf}
}
% Edited volume: part 1 of "Korpusova lingvistika Praha 2011" (series volume 14).
% booktitle repeats title.
@book{korpling1:2011,
  editor        = {Franti{\v s}ek {\v C}erm{\'a}k},
  title         = {Korpusov{\'a} lingvistika Praha 2011: 1 -- InterCorp},
  booktitle     = {Korpusov{\'a} lingvistika Praha 2011: 1 -- InterCorp},
  series        = {Studie z korpusov{\'e} lingvistiky},
  volume        = {14},
  publisher     = {Nakladatelstv{\'\i} Lidov{\'e} noviny},
  organization  = {{\'U}stav {\v C}esk{\'e}ho n{\'a}rodn{\'\i}ho korpusu},
  address       = {Praha},
  year          = {2011},
  date-added    = {2011-10-09 17:36:30 +0200},
  date-modified = {2012-07-02 15:18:45 +0000},
}
% Chapter in "InterCorp: Exploring a Multilingual Corpus" (series volume 13).
% No abstract or URL is recorded for this chapter.
@incollection{Cermakova:Farova:2010,
  address = {Praha},
  author = {Anna {\v C}erm{\'a}kov{\'a} and Lenka F{\'a}rov{\'a}},
  booktitle = {InterCorp: Exploring a Multilingual Corpus},
  date-added = {2012-07-02 15:08:33 +0000},
  date-modified = {2012-07-02 15:11:01 +0000},
  editor = {Franti{\v s}ek {\v C}erm{\'a}k and Patrick Corness and Ale{\v s} Kl{\'e}gr},
  isbn = {978-80-7422-042-5},
  pages = {177--188},
  publisher = {Nakladatelstv{\'\i} Lidov{\'e} noviny},
  series = {Studie z korpusov{\'e} lingvistiky},
  title = {Keywords in {Harry Potter} and their {Czech} and {Finnish} Translation Equivalents},
  volume = {13},
  year = {2010}
}
% Chapter in "InterCorp: Exploring a Multilingual Corpus" (series volume 13).
% abstract_cs and title_cs hold the Czech versions of the abstract and title.
@incollection{ros:10a,
  abstract_cs = {Sady morfosyntaktick{\'y}ch zna{\v c}ek, kter{\'e} se pou{\v z}{\'\i}vaj{\'\i} p{\v r}i anotaci korpusu, {\v c}asto t{\v r}{\'\i}d{\'\i} slovn{\'\i} druhy a morfologick{\'e} kategorie na z{\'a}klad{\v e} odli{\v s}n{\'y}ch krit{\'e}ri{\'\i}, a to i v r{\'a}mci jednoho jazyka. Identifikace odpov{\'\i}daj{\'\i}c{\'\i}ch morfosyntaktick{\'y}ch kategori{\'\i} v textech ozna{\v c}kovan{\'y}ch podle odli{\v s}n{\'y}ch sch{\'e}mat pak m{\r u}{\v z}e b{\'y}t obt{\'\i}{\v z}n{\'a}. Nap{\v r}. z{\'a}kladn{\'\i} {\v c}{\'\i}slovky spolu s {\v r}adov{\'y}mi mohou v tagsetu A pat{\v r}it do t{\v r}{\'\i}dy {\v c}{\'\i}slovek, zat{\'\i}mco osobn{\'\i} z{\'a}jmena spolu s p{\v r}ivlast{\v n}ovac{\'\i}mi do t{\v r}{\'\i}dy z{\'a}jmen. Oproti tomu v tagsetu B mohou z{\'a}kladn{\'\i} {\v c}{\'\i}slovky a osobn{\'\i} z{\'a}jmena pat{\v r}it do stejn{\'e} t{\v r}{\'\i}dy spolu se substantivy, p{\v r}i{\v c}em{\v z} {\v c}{\'\i}slovky {\v r}adov{\'e} a p{\v r}ivlast{\v n}ovac{\'\i} z{\'a}jmena do stejn{\'e} t{\v r}{\'\i}dy spolu s adjektivy. Takto dispar{\'a}tn{\v e} ozna{\v c}kovan{\'e} texty mohou komplikovat automatick{\'e} zpracov{\'a}n{\'\i} i hled{\'a}n{\'\i} v korpusu. Pro {\'u}{\v c}ely paraleln{\'\i}ho korpusu InterCorp (http://korpus.cz/intercorp-info.php), kter{\'y} toho {\v c}asu obsahuje 25 jazyk{\r u}, by bylo mo{\v z}n{\'e} navrhnout jeden "harmonizovan{\'y}" tagset (podobn{\v e} jako v projektu MULTEXT-East), nebo -- je{\v s}t{\v e} l{\'e}pe -- zak{\'o}dovat informace ze v{\v s}ech tagset{\r u} do morfosyntaktick{\'e} "interlingvy" (viz Interset Dana Zemana). Paralela s p{\v r}irozen{\'y}mi jazyky je zde na m{\'\i}st{\v e}: probl{\'e}my s chyb{\v e}j{\'\i}c{\'\i}mi ekvivalenty vznikaj{\'\i} p{\v r}i p{\v r}ekladu zna{\v c}ek i slov. Proto navrhujeme tagsetovou interlingvu jako hierarchii (svaz) kategori{\'\i}, kter{\'e} odpov{\'\i}daj{\'\i} zna{\v c}k{\'a}m v jednotliv{\'y}ch jazyc{\'\i}ch/tagsetech.
Pokud pro danou kategorii v n{\v e}kter{\'e}m jazyce chyb{\'\i} zna{\v c}ka, je mo{\v z}n{\'e} ji nahradit zna{\v c}kou obecn{\v e}j{\v s}{\'\i} nebo disjunkc{\'\i} zna{\v c}ek specifi{\v c}t{\v e}j{\v s}{\'\i}ch. Podobn{\v e} jako p{\v r}i konstrukci v{\'\i}cejazy{\v c}n{\'e} lexik{\'a}ln{\'\i} datab{\'a}ze lze i pro hierarchii morfosyntaktick{\'y}ch kategori{\'\i} vyu{\v z}{\'\i}t metody form{\'a}ln{\'\i} konceptu{\'a}ln{\'\i} anal{\'y}zy.},
  address = {Praha},
  author = {Alexandr Rosen},
  booktitle = {InterCorp: Exploring a Multilingual Corpus},
  date-modified = {2011-11-20 14:13:33 +0000},
  editor = {Franti{\v s}ek {\v C}erm{\'a}k and Patrick Corness and Ale{\v s} Kl{\'e}gr},
  isbn = {978-80-7422-042-5},
  pages = {205--234},
  publisher = {Nakladatelstv{\'\i} Lidov{\'e} noviny},
  series = {Studie z korpusov{\'e} lingvistiky},
  title = {Morphological Tags in Parallel Corpora},
  title_cs = {Morfologick{\'e} zna{\v c}ky v paraleln{\'\i}ch korpusech},
  url = {http://utkl.ff.cuni.cz/~rosen/public/unitags_pap.pdf},
  volume = {13},
  year = {2010},
  abstract = {Tagsets, used to annotate corpora, often classify word classes and morphological categories according to different criteria, even within a single language. Then it can be difficult to identify corresponding morphosyntactic categories in texts tagged by different schemata. In tagset A, cardinal and ordinal numerals may belong to the class of numerals, while personal and possessive pronouns to the class of pronouns. In tagset B, on the other hand, cardinal numerals and personal pronouns may belong to the class of nouns, while ordinal numerals and possessive pronouns to the class of adjectives.
Texts tagged in such disparate ways make searching and automatic processing harder. For a parallel corpus such as InterCorp (http://korpus.cz/intercorp-info.php), currently including 25 languages, a single "harmonized" tagset could be designed (similarly as in the project MULTEXT-East), or -- even better -- to encode the information from all tagsets into a morphosyntactic "interlingua" (see Dan Zeman's Interset). The parallel with natural languages is appropriate: problems with missing equivalents occur in the translation of words as well as tags. Thus we propose a tagset interlingua as a hierarchy (lattice) of categories, corresponding to language-specific tags. A missing tag in a language can be substituted by a more general tag or by a disjunction of more specific tags. Similarly as with multilingual lexical databases the methods of Formal Concept Analysis can be used.},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/unitags_pap.pdf}
}
% Paper on the TCA2 tool in "Mnohojazycny korpus InterCorp: Moznosti studia";
% title and booktitle carry English translations in square brackets.
@inproceedings{Vondricka:10,
  address = {Praha},
  author = {Pavel Vond{\v r}i{\v c}ka},
  booktitle = {Mnohojazy{\v c}n{\'y} korpus InterCorp: Mo{\v z}nosti studia [Multilingual Corpus InterCorp: Research Options]},
  date-modified = {2012-03-21 12:53:51 +0000},
  editor = {Franti{\v s}ek {\v C}erm{\'a}k and Jan Kocek},
  pages = {225--231},
  publisher = {Nakladatelstv{\'\i} Lidov{\'e} noviny},
  title = {{TCA2} -- n{\'a}stroj pro zpracov{\'a}v{\'a}n{\'\i} p{\v r}ekladov{\'y}ch korpus{\r u} [{TCA2} -- a tool for processing translation corpora]},
  year = {2010}
}
% Edited volume "Mnohojazycny korpus InterCorp: Moznosti studia" (2010).
% booktitle repeats title.
@book{Cermak:Kocek:2010,
  editor        = {Franti{\v s}ek {\v C}erm{\'a}k and Jan Kocek},
  title         = {Mnohojazy{\v c}n{\'y} korpus InterCorp: Mo{\v z}nosti studia},
  booktitle     = {Mnohojazy{\v c}n{\'y} korpus InterCorp: Mo{\v z}nosti studia},
  publisher     = {Nakladatelstv{\'\i} Lidov{\'e} noviny},
  address       = {Praha},
  year          = {2010},
  abstract      = {Tento svazek je prvn{\'\i}m v{\'y}stupem z mnohojazy{\v c}n{\'e}ho paraleln{\'\i}ho korpusu p{\v r}in{\'a}{\v s}ej{\'\i}c{\'\i}m studie a  srovn{\'a}vac{\'\i} anal{\'y}zy celkem 13 jazyk{\r u} (v{\r u}{\v c}i {\v c}e{\v s}tin{\v e}) v oblasti lexikonu, gramatiky, slovotvorby, frazeologie, syntaxe a dal{\v s}{\'\i}ch.},
  date-added    = {2013-04-28 20:55:19 +0000},
  date-modified = {2013-04-28 20:55:38 +0000},
}
% Edited volume "InterCorp: Exploring a Multilingual Corpus" (2010).
% isbn, series and volume match the chapter entries in this file that cite this book.
@book{Cermak:etal:2010,
  address = {Praha},
  booktitle = {InterCorp: Exploring a Multilingual Corpus},
  date-added = {2013-04-28 20:52:51 +0000},
  date-modified = {2013-04-28 20:53:26 +0000},
  editor = {Franti{\v s}ek {\v C}erm{\'a}k and Patrick Corness and Ale{\v s} Kl{\'e}gr},
  isbn = {978-80-7422-042-5},
  publisher = {Nakladatelstv{\'\i} Lidov{\'e} noviny},
  series = {Studie z korpusov{\'e} lingvistiky},
  title = {InterCorp: Exploring a Multilingual Corpus},
  volume = {13},
  year = {2010},
  abstract = {Exploration of grammar, lexis, translations, applications, and methodological issues are studied and illustrated on language pairs or on a group of more languages. This is supplemented by broad and general contributions delineating the field of comparative multilingual corpus linguistics showing possible directions of comparative research based on a multilingual parallel corpus.}
}
% Conference paper presenting the InterCorp project (St. Petersburg, 2008).
% Only the proper noun in the title is brace-protected, so styles can still
% apply their own casing to the rest.
@inproceedings{VavRos:08,
  address = {St. Petersburg},
  author = {Martin Vav{\v r}{\'\i}n and Alexandr Rosen},
  booktitle = {Proceedings of the International Conference Corpus Linguistics -- 2008},
  date-modified = {2011-10-09 18:00:33 +0200},
  isbn = {978-5-288-04769-5},
  language = {english},
  pages = {97--104},
  publisher = {St. Petersburg State University},
  title = {{InterCorp}: A Multilingual Parallel Corpus Project},
  url = {http://utkl.ff.cuni.cz/~rosen/public/2008_intercorp_peterburg.pdf},
  year = {2008},
  bdsk-url-1 = {http://utkl.ff.cuni.cz/~rosen/public/2008_intercorp_peterburg.pdf}
}

This file was generated by bibtex2html 1.96.