[{"data":1,"prerenderedAt":-1},["ShallowReactive",2],{"sidebar-data":3,"breadcrumb-conf-2018":849,"conference-2018":850,"papers-2018":851},{"conferences":4,"tutorials":254,"workshops":260},[5,27,47,65,79,96,113,130,147,164,180,195,211,225,240],{"conference_id":6,"year":7,"proceedings_title":8,"venue_ids":9,"isbn":10,"issn":11,"doi":12,"publisher":13,"editors":14,"conference_name":15,"conference_acronym":16,"conference_number":17,"conference_location":18,"conference_city":19,"conference_country":20,"conference_start_date":21,"conference_end_date":22,"conference_url":23,"pdf_url":24,"img_conf_url":25,"paperCount":26},"lrec2026","2026","Proceedings of the Fifteenth Language Resources and Evaluation Conference (LREC 2026)","lrec","978-2-493814-49-4","2522-2686","10.63317\u002F4fxzgre27xzj","European Language Resources Association (ELRA)","Stelios Piperidis, Núria Bel, Henk van den Heuvel, Nancy Ide, Simon Krek, Antonio Toral","The Fifteenth Language Resources and Evaluation Conference (LREC 2026)","LREC","15","Palau de Congressos de Palma","Palma, Mallorca","Spain","2026-05-11","2026-05-16","https:\u002F\u002Flrec2026.info","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2026\u002FLREC-2026.pdf",null,944,{"conference_id":28,"year":29,"proceedings_title":30,"venue_ids":31,"isbn":32,"issn":11,"doi":33,"publisher":34,"editors":35,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_city":40,"conference_country":41,"conference_start_date":42,"conference_end_date":43,"conference_url":44,"pdf_url":45,"img_conf_url":25,"paperCount":46},"lrec2024","2024","Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)","lrec|coling","979-10-95546-34-4","10.63317\u002F375ba8vd9q2v","European Language Resources Association (ELRA) and ICCL","Nicoletta Calzolari, Min-Yen Kan, Veronique Hoste, Alessandro Lenci, Sakriani Sakti, Nianwen 
Xue","Joint International Conference on Computational Linguistics, Language Resources and Evaluation","LREC-COLING","14","Lingotto Conference Centre","Turin","Italy","2024-05-20","2024-05-25","https:\u002F\u002Flrec-coling-2024.org","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002FLREC-2024.pdf",1554,{"conference_id":48,"year":49,"proceedings_title":50,"venue_ids":9,"isbn":51,"issn":11,"doi":52,"publisher":13,"editors":53,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_city":57,"conference_country":58,"conference_start_date":59,"conference_end_date":60,"conference_url":61,"pdf_url":62,"img_conf_url":63,"paperCount":64},"lrec2022","2022","Proceedings of the Thirteenth International Conference on Language Resources and Evaluation (LREC 2022)","79-10-95546-38-2","10.63317\u002F296vkvmh42ye","Nicoletta Calzolari, Frédéric Béchet, Philippe Blache, Khalid Choukri, Christopher Cieri, Thierry Declerck, Sara Goggi, Hitoshi Isahara, Bente Maegaard, Joseph Mariani, Hélène Mazo, Jan Odijk, Stelios Piperidis2020","Thirteenth Language Resources and Evaluation Conference","13","Palais du Pharo","Marseille","France","2022-06-20","2022-06-25","https:\u002F\u002Flrec2022.lrec-conf.org\u002Fen\u002F","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2022\u002FLREC-2022.pdf","",804,{"conference_id":66,"year":67,"proceedings_title":68,"venue_ids":9,"isbn":69,"issn":11,"doi":70,"publisher":13,"editors":71,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_city":57,"conference_country":58,"conference_start_date":74,"conference_end_date":75,"conference_url":76,"pdf_url":77,"img_conf_url":63,"paperCount":78},"lrec2020","2020","Proceedings of the Twelfth International Conference on Language Resources and Evaluation (LREC 2020)","79-10-95546-34-4","10.63317\u002F4j46u44gnpwr","Nicoletta Calzolari, Frédéric Béchet, Philippe Blache, Khalid 
Choukri, Christopher Cieri, Thierry Declerck, Sara Goggi, Hitoshi Isahara, Bente Maegaard, Joseph Mariani, Hélène Mazo, Asuncion Moreno, Jan Odijk, Stelios Piperidis","Twelfth Language Resources and Evaluation Conference","12","2020-05-11","2020-05-16","https:\u002F\u002Flrec2020.lrec-conf.org\u002Fen\u002Findex.html","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002FLREC-2020.pdf",895,{"conference_id":80,"year":81,"proceedings_title":82,"venue_ids":9,"isbn":83,"issn":11,"doi":84,"publisher":13,"editors":85,"conference_name":86,"conference_acronym":16,"conference_number":87,"conference_location":88,"conference_city":89,"conference_country":90,"conference_start_date":91,"conference_end_date":92,"conference_url":93,"pdf_url":94,"img_conf_url":63,"paperCount":95},"lrec2018","2018","Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)","79-10-95546-00-9","10.63317\u002F25jzjyk647iz","Nicoletta Calzolari, Khalid Choukri, Christopher Cieri, Thierry Declerck, Sara Goggi, Koiti Hasida, Hitoshi Isahara, Bente Maegaard, Joseph Mariani, Hélène Mazo, Asuncion Moreno, Jan Odijk, Stelios Piperidis, Takenobu Tokunaga","Eleventh International Conference on Language Resources and Evaluation","11","Phoenix Seagaia Resort","Miyazaki","Japan","2018-05-07","2018-05-12","http:\u002F\u002Flrec2018.lrec-conf.org\u002Fen\u002F","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002FLREC2018_Proceedings.zip",728,{"conference_id":97,"year":98,"proceedings_title":99,"venue_ids":9,"isbn":100,"issn":11,"doi":101,"publisher":13,"editors":102,"conference_name":103,"conference_acronym":16,"conference_number":104,"conference_location":105,"conference_city":106,"conference_country":107,"conference_start_date":108,"conference_end_date":109,"conference_url":110,"pdf_url":111,"img_conf_url":63,"paperCount":112},"lrec2016","2016","Proceedings of the Tenth International Conference on Language Resources and Evaluation 
(LREC 2016)","978-2-9517408-9-1","10.63317\u002F5mruwrazrwbg","Nicoletta Calzolari, Khalid Choukri, Thierry Declerck, Sara Goggi, Marko Grobelnik, Bente Maegaard, Joseph Mariani, Hélène Mazo, Asunción Moreno, Jan Odijk, Stelios Piperidis","Tenth International Conference on Language Resources and Evaluation","10","Bernardinsko Naselje","Portorož","Slovenia","2016-05-23","2016-05-28","http:\u002F\u002Flrec2016.lrec-conf.org\u002Fen\u002F","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2016\u002FLREC2016_Proceedings.zip",745,{"conference_id":114,"year":115,"proceedings_title":116,"venue_ids":9,"isbn":117,"issn":11,"doi":118,"publisher":13,"editors":119,"conference_name":120,"conference_acronym":16,"conference_number":121,"conference_location":122,"conference_city":123,"conference_country":124,"conference_start_date":125,"conference_end_date":126,"conference_url":127,"pdf_url":128,"img_conf_url":63,"paperCount":129},"lrec2014","2014","Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC 2014)","978-2-9517408-8-4","10.63317\u002F3ebxpiqq4ikp","Nicoletta Calzolari, Khalid Choukri, Thierry Declerck, Hrafn Loftsson, Bente Maegaard, Joseph Mariani, Asuncion Moreno, Jan Odijk, Stelios Piperidis","Ninth International Conference on Language Resources and Evaluation","9","Harpa Concert Hall and Conference 
Centre","Reykjavik","Iceland","2014-05-26","2014-05-31","http:\u002F\u002Flrec2014.lrec-conf.org\u002Fen\u002F","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2014\u002FLREC2014_Proceedings.zip",746,{"conference_id":131,"year":132,"proceedings_title":133,"venue_ids":9,"isbn":134,"issn":11,"doi":135,"publisher":13,"editors":136,"conference_name":137,"conference_acronym":16,"conference_number":138,"conference_location":139,"conference_city":140,"conference_country":141,"conference_start_date":142,"conference_end_date":143,"conference_url":144,"pdf_url":145,"img_conf_url":63,"paperCount":146},"lrec2012","2012","Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012)","978-2-9517408-7-7","10.63317\u002F42za3jv29xvs","Nicoletta Calzolari, Khalid Choukri, Thierry Declerck, Mehmet Doğan, Bente Maegaard, Joseph Mariani, Asuncion Moreno, Jan Odijk, Stelios Piperidis","Eighth International Conference on Language Resources and Evaluation","8","Istanbul Convention & Exhibition Centre (ICEC) (Lütfi Kırdar)","Istanbul","Turkey","2012-05-21","2012-05-27","http:\u002F\u002Fwww.lrec-conf.org\u002Flrec2012\u002F","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2012\u002FLREC2012_Proceedings.zip",670,{"conference_id":148,"year":149,"proceedings_title":150,"venue_ids":9,"isbn":151,"issn":11,"doi":152,"publisher":13,"editors":153,"conference_name":154,"conference_acronym":16,"conference_number":155,"conference_location":156,"conference_city":157,"conference_country":158,"conference_start_date":159,"conference_end_date":160,"conference_url":161,"pdf_url":162,"img_conf_url":63,"paperCount":163},"lrec2010","2010","Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC 2010)","2-9517408-6-7","10.63317\u002F32m6vov78mmv","Nicoletta Calzolari, Khalid Choukri, Bente Maegaard, Joseph Mariani, Jan Odijk, Stelios Piperidis, Mike Rosner, Daniel Tapias","Seventh International Conference 
on Language Resources and Evaluation","7","Mediterranean Conference Centre (MCC)","Valletta","Malta","2010-05-17","2010-05-23","http:\u002F\u002Fwww.lrec-conf.org\u002Flrec2010\u002F","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2010\u002FLREC2010_Proceedings.zip",645,{"conference_id":165,"year":166,"proceedings_title":167,"venue_ids":9,"isbn":168,"issn":11,"doi":169,"publisher":13,"editors":170,"conference_name":171,"conference_acronym":16,"conference_number":172,"conference_location":173,"conference_city":174,"conference_country":175,"conference_start_date":176,"conference_end_date":177,"conference_url":178,"pdf_url":63,"img_conf_url":63,"paperCount":179},"lrec2008","2008","Proceedings of the Sixth International Conference on Language Resources and Evaluation (LREC 2008)","2-9517408-4-0","10.63317\u002F3c6xa89msnta","Nicoletta Calzolari, Khalid Choukri, Bente Maegaard, Joseph Mariani, Jan Odijk, Stelios Piperidis, Daniel Tapias","Sixth International Conference on Language Resources and Evaluation","6","Palais des Congrès","Marrakech","Morocco","2008-05-28","2008-05-30","http:\u002F\u002Fwww.lrec-conf.org\u002Flrec2008\u002F",620,{"conference_id":181,"year":182,"proceedings_title":183,"venue_ids":9,"isbn":184,"issn":11,"doi":185,"publisher":13,"editors":186,"conference_name":187,"conference_acronym":16,"conference_number":188,"conference_location":189,"conference_city":190,"conference_country":41,"conference_start_date":191,"conference_end_date":192,"conference_url":193,"pdf_url":63,"img_conf_url":63,"paperCount":194},"lrec2006","2006","Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC 2006)","2-9517408-2-4","10.63317\u002F2xx3x75ppppa","Nicoletta Calzolari, Khalid Choukri, Aldo Gangemi, Bente Maegaard, Joseph Mariani, Jan Odijk, Daniel Tapias","Fifth International Conference on Language Resources and Evaluation","5","Magazzini del 
Cotone","Genoa","2006-05-24","2006-05-26","http:\u002F\u002Fwww.lrec-conf.org\u002Flrec2006\u002F",513,{"conference_id":196,"year":197,"proceedings_title":198,"venue_ids":9,"isbn":199,"issn":11,"doi":200,"publisher":13,"editors":201,"conference_name":202,"conference_acronym":16,"conference_number":203,"conference_location":204,"conference_city":205,"conference_country":206,"conference_start_date":207,"conference_end_date":208,"conference_url":209,"pdf_url":63,"img_conf_url":63,"paperCount":210},"lrec2004","2004","Proceedings of the Fourth International Conference on Language Resources and Evaluation (LREC 2004)","2-9517408-1-6","10.63317\u002F2s47745g6zhw","Maria Teresa Lino, Maria Francisca Xavier, Fatima Ferreira, Rute Costa, Raquel Silva","Fourth International Conference on Language Resources and Evaluation","4","Centro Cultural de Belém","Lisbon","Portugal","2004-05-26","2004-05-28","http:\u002F\u002Fwww.lrec-conf.org\u002Flrec2004\u002F",524,{"conference_id":212,"year":213,"proceedings_title":214,"venue_ids":9,"isbn":63,"issn":11,"doi":215,"publisher":13,"editors":216,"conference_name":217,"conference_acronym":16,"conference_number":218,"conference_location":219,"conference_city":220,"conference_country":20,"conference_start_date":221,"conference_end_date":222,"conference_url":223,"pdf_url":63,"img_conf_url":63,"paperCount":224},"lrec2002","2002","Proceedings of the Third International Conference on Language Resources and Evaluation (LREC 2002)","10.63317\u002F3ha6dpna2o97","Manuel González Rodríguez, Carmen Paz Suarez Araujo","Third International Conference on Language Resources and Evaluation","3","Auditorio Alfredo Kraus","Las 
Palmas","2002-05-29","2002-05-31","http:\u002F\u002Fwww.lrec-conf.org\u002Flrec2002\u002F",354,{"conference_id":226,"year":227,"proceedings_title":228,"venue_ids":9,"isbn":63,"issn":11,"doi":229,"publisher":13,"editors":230,"conference_name":231,"conference_acronym":16,"conference_number":232,"conference_location":233,"conference_city":234,"conference_country":235,"conference_start_date":236,"conference_end_date":237,"conference_url":238,"pdf_url":63,"img_conf_url":63,"paperCount":239},"lrec2000","2000","Proceedings of the Second International Conference on Language Resources and Evaluation (LREC 2000)","10.63317\u002F3yosukd7w6sn","Maria Gavrilidou, George Carayannis, Stella Markantonatou, Stelios Piperidis, Greg Stainhauer","Second International Conference on Language Resources and Evaluation","2","Zappeion Megaron","Athens","Greece","2000-05-31","2000-06-02","http:\u002F\u002Fwww.lrec-conf.org\u002Flrec2000\u002F",280,{"conference_id":241,"year":242,"proceedings_title":243,"venue_ids":9,"isbn":63,"issn":11,"doi":244,"publisher":13,"editors":245,"conference_name":246,"conference_acronym":16,"conference_number":247,"conference_location":248,"conference_city":249,"conference_country":20,"conference_start_date":250,"conference_end_date":251,"conference_url":252,"pdf_url":63,"img_conf_url":63,"paperCount":253},"lrec1998","1998","Proceedings of the First International Conference on Language Resources and Evaluation (LREC 1998)","10.63317\u002F5a986fnjefzm","Antonio Rubio,Natividad Gallardo, Rosa Castro, Antonio Tejada","Language Resources and Evaluation Conference","1","Palacio de Congresos de 
Granada","Granada","1998-05-28","1998-05-30","http:\u002F\u002Fwww.lrec-conf.org\u002Flrec1998\u002F",212,[255],{"year":29,"proceedings_title":256,"paperCount":257,"doi":258,"pdf_url":259,"venue_ids":31,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024): Tutorial Summaries",13,"10.63317\u002F3piy8jnqffp3","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Ftutorials\u002FLREC-2024-Tutorials.pdf",{"2020":261,"2022":452,"2024":617},[262,269,276,282,289,296,302,308,315,322,328,335,341,348,354,359,364,370,376,382,387,392,397,402,407,413,419,425,431,436,442,447],{"workshop_id":263,"year":67,"full_workshop_id":264,"proceedings_title":265,"paperCount":266,"doi":267,"pdf_url":268,"venue_ids":263,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"aespen","lrec2020_ws_aespen","Proceedings of the Workshop on Automated Extraction of Socio-political Events from News 2020",11,"10.63317\u002F58onsa8rnrrz","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FAESPEN2020\u002FAESPEN-2020.pdf",{"workshop_id":270,"year":67,"full_workshop_id":271,"proceedings_title":272,"paperCount":273,"doi":274,"pdf_url":275,"venue_ids":270,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"ai4hi","lrec2020_ws_ai4hi","Proceedings of the 1st International Workshop on Artificial Intelligence for Historical Image Enrichment and 
Access",5,"10.63317\u002F3m5ep69cw7jj","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FAI4HI2020\u002FAI4HI-2020.pdf",{"workshop_id":277,"year":67,"full_workshop_id":278,"proceedings_title":279,"paperCount":266,"doi":280,"pdf_url":281,"venue_ids":277,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"bucc","lrec2020_ws_bucc","Proceedings of the 13th Workshop on Building and Using Comparable Corpora","10.63317\u002F2fx83jms4c9r","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FBUCC2020\u002FBUCC-2020.pdf",{"workshop_id":283,"year":67,"full_workshop_id":284,"proceedings_title":285,"paperCount":286,"doi":287,"pdf_url":288,"venue_ids":283,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"calcs","lrec2020_ws_calcs","Proceedings of the 4th Workshop on Computational Approaches to Code Switching",9,"10.63317\u002F3jbxrkvj6qkv","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FCS2020\u002FCALCS-2020.pdf",{"workshop_id":290,"year":67,"full_workshop_id":291,"proceedings_title":292,"paperCount":293,"doi":294,"pdf_url":295,"venue_ids":290,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"cllrd","lrec2020_ws_cllrd","Proceedings of the LREC 2020 Workshop on \"Citizen Linguistics in Language Resource 
Development\"",8,"10.63317\u002F3qo9e6q6vq5f","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002Fcllrd2020\u002FCLLRD-2020.pdf",{"workshop_id":297,"year":67,"full_workshop_id":298,"proceedings_title":299,"paperCount":266,"doi":300,"pdf_url":301,"venue_ids":297,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"clssts","lrec2020_ws_clssts","Proceedings of the workshop on Cross-Language Search and Summarization of Text and Speech (CLSSTS2020)","10.63317\u002F3p6twmv6mhc5","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FCLSSTS2020\u002FCLSSTS-2020.pdf",{"workshop_id":303,"year":67,"full_workshop_id":304,"proceedings_title":305,"paperCount":286,"doi":306,"pdf_url":307,"venue_ids":303,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"cmlc","lrec2020_ws_cmlc","Proceedings of the 8th Workshop on Challenges in the Management of Large Corpora","10.63317\u002F236pt6g4g4s4","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FCMLC-8\u002FCMLC-2020.pdf",{"workshop_id":309,"year":67,"full_workshop_id":310,"proceedings_title":311,"paperCount":312,"doi":313,"pdf_url":314,"venue_ids":309,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"computerm","lrec2020_ws_computerm","Proceedings of the 6th International Workshop on Computational 
Terminology",15,"10.63317\u002F4jp43md9xe2q","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FCOMPUTERM2020\u002FCOMPUTERM-2020.pdf",{"workshop_id":316,"year":67,"full_workshop_id":317,"proceedings_title":318,"paperCount":319,"doi":320,"pdf_url":321,"venue_ids":316,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"framenet","lrec2020_ws_framenet","Proceedings of the International FrameNet Workshop 2020: Towards a Global, Multilingual FrameNet",12,"10.63317\u002F4tjynpg2ohf3","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002Fframenet2020\u002FFrameNet-2020.pdf",{"workshop_id":323,"year":67,"full_workshop_id":324,"proceedings_title":325,"paperCount":319,"doi":326,"pdf_url":327,"venue_ids":323,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"gamnlp","lrec2020_ws_gamnlp","Proceedings of the Workshop on Games and Natural Language Processing","10.63317\u002F5ahttrxdfnza","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FGames-NLP\u002FGAMNLP-2020.pdf",{"workshop_id":329,"year":67,"full_workshop_id":330,"proceedings_title":331,"paperCount":332,"doi":333,"pdf_url":334,"venue_ids":329,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"globalex","lrec2020_ws_globalex","Proceedings of the 2020 Globalex Workshop on Linked 
Lexicography",18,"10.63317\u002F34yjjfrnwvj8","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FGLOBALEX2020\u002FGLOBALEX-2020.pdf",{"workshop_id":336,"year":67,"full_workshop_id":337,"proceedings_title":338,"paperCount":319,"doi":339,"pdf_url":340,"venue_ids":336,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"isa","lrec2020_ws_isa","Proceedings of the 16th Joint ACL-ISO Workshop on Interoperable Semantic Annotation","10.63317\u002F5id7rv8izjcd","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FISA16\u002FISA-2020.pdf",{"workshop_id":342,"year":67,"full_workshop_id":343,"proceedings_title":344,"paperCount":345,"doi":346,"pdf_url":347,"venue_ids":342,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"iwltp","lrec2020_ws_iwltp","Proceedings of the 1st International Workshop on Language Technology Platforms",17,"10.63317\u002F4hc34do825yz","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FIWLTP2020\u002FIWLTP-2020.pdf",{"workshop_id":349,"year":67,"full_workshop_id":350,"proceedings_title":351,"paperCount":319,"doi":352,"pdf_url":353,"venue_ids":349,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"ldl","lrec2020_ws_ldl","Proceedings of the 7th Workshop on Linked Data in Linguistics 
(LDL-2020)","10.63317\u002F3mn9ttzvdbxs","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2020\u002Fworkshops\u002FLDL2020\u002FLDL-2020.pdf",{"workshop_id":355,"year":67,"full_workshop_id":356,"proceedings_title":357,"paperCount":293,"doi":358,"pdf_url":63,"venue_ids":355,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"lincr","lrec2020_ws_lincr","Proceedings of the Second Workshop on Linguistic and Neurocognitive Resources","10.63317\u002F24gnv8q9cz94",{"workshop_id":360,"year":67,"full_workshop_id":361,"proceedings_title":362,"paperCount":286,"doi":363,"pdf_url":63,"venue_ids":360,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"lr4sshoc","lrec2020_ws_lr4sshoc","Proceedings of the Workshop about Language Resources for the SSH Cloud","10.63317\u002F5j7vesdm7yia",{"workshop_id":365,"year":67,"full_workshop_id":366,"proceedings_title":367,"paperCount":368,"doi":369,"pdf_url":63,"venue_ids":365,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"lt4gov","lrec2020_ws_lt4gov","Proceedings of the 1st Workshop on Language Technologies for Government and Public Administration (LT4Gov)",6,"10.63317\u002F5i8su82ish3i",{"workshop_id":371,"year":67,"full_workshop_id":372,"proceedings_title":373,"paperCount":374,"doi":375,"pdf_url":63,"venue_ids":371,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"lt4hala","lrec2020_ws_lt4hala","Proceedings of LT4HALA 2020 - 1st Workshop on Language Technologies for Historical and Ancient 
Languages",21,"10.63317\u002F4jnfg39ctsra",{"workshop_id":377,"year":67,"full_workshop_id":378,"proceedings_title":379,"paperCount":380,"doi":381,"pdf_url":63,"venue_ids":377,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"mmw","lrec2020_ws_mmw","Proceedings of the LREC 2020 Workshop on Multimodal Wordnets (MMW2020)",7,"10.63317\u002F5pcp4c88d6n8",{"workshop_id":383,"year":67,"full_workshop_id":384,"proceedings_title":385,"paperCount":368,"doi":386,"pdf_url":63,"venue_ids":383,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"multilingualbio","lrec2020_ws_multilingualbio","Proceedings of the LREC 2020 Workshop on Multilingual Biomedical Text Processing (MultilingualBIO 2020)","10.63317\u002F4pfckaywoxxa",{"workshop_id":388,"year":67,"full_workshop_id":389,"proceedings_title":390,"paperCount":273,"doi":391,"pdf_url":63,"venue_ids":388,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"onion","lrec2020_ws_onion","Proceedings of LREC2020 Workshop \"People in language, vision and the mind\" (ONION2020)","10.63317\u002F2oxdsr8tue27",{"workshop_id":393,"year":67,"full_workshop_id":394,"proceedings_title":395,"paperCount":332,"doi":396,"pdf_url":63,"venue_ids":393,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"osact","lrec2020_ws_osact","Proceedings of the 4th Workshop on Open-Source Arabic Corpora and Processing Tools, with a Shared Task on Offensive Language 
Detection","10.63317\u002F2xjmcg9vsxcp",{"workshop_id":398,"year":67,"full_workshop_id":399,"proceedings_title":400,"paperCount":257,"doi":401,"pdf_url":63,"venue_ids":398,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"parlaclarin","lrec2020_ws_parlaclarin","Proceedings of the Second ParlaCLARIN Workshop","10.63317\u002F3qhkh6dmemmn",{"workshop_id":403,"year":67,"full_workshop_id":404,"proceedings_title":405,"paperCount":286,"doi":406,"pdf_url":63,"venue_ids":403,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"rail","lrec2020_ws_rail","Proceedings of the first workshop on Resources for African Indigenous Languages","10.63317\u002F3fjhbkudhcmc",{"workshop_id":408,"year":67,"full_workshop_id":409,"proceedings_title":410,"paperCount":411,"doi":412,"pdf_url":63,"venue_ids":408,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"readi","lrec2020_ws_readi","Proceedings of the 1st Workshop on Tools and Resources to Empower People with REAding DIfficulties (READI)",14,"10.63317\u002F4p5m2euxriim",{"workshop_id":414,"year":67,"full_workshop_id":415,"proceedings_title":416,"paperCount":417,"doi":418,"pdf_url":63,"venue_ids":414,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"restup","lrec2020_ws_restup","Proceedings of the Workshop on Resources and Techniques for User and Author Profiling in Abusive 
Language",4,"10.63317\u002F3y3vzhsp3qb7",{"workshop_id":420,"year":67,"full_workshop_id":421,"proceedings_title":422,"paperCount":423,"doi":424,"pdf_url":63,"venue_ids":420,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"signlang","lrec2020_ws_signlang","Proceedings of the LREC2020 9th Workshop on the Representation and Processing of Sign Languages: Sign Language Resources in the Service of the Language Community, Technological Challenges and Application Perspectives",36,"10.63317\u002F3nocn9xntuki",{"workshop_id":426,"year":67,"full_workshop_id":427,"proceedings_title":428,"paperCount":429,"doi":430,"pdf_url":63,"venue_ids":426,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"sltu","lrec2020_ws_sltu","Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)",52,"10.63317\u002F25p2yts6fk3q",{"workshop_id":432,"year":67,"full_workshop_id":433,"proceedings_title":434,"paperCount":293,"doi":435,"pdf_url":63,"venue_ids":432,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"stoc","lrec2020_ws_stoc","Proceedings for the First International Workshop on Social Threats in Online Conversations: Understanding and Management","10.63317\u002F4p7j6t9bjg8m",{"workshop_id":437,"year":67,"full_workshop_id":438,"proceedings_title":439,"paperCount":440,"doi":441,"pdf_url":63,"venue_ids":437,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"trac","lrec2020_ws_trac","Proceedings 
of the Second Workshop on Trolling, Aggression and Cyberbullying",25,"10.63317\u002F27yyhn22v2fc",{"workshop_id":443,"year":67,"full_workshop_id":444,"proceedings_title":445,"paperCount":293,"doi":446,"pdf_url":63,"venue_ids":443,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"wac","lrec2020_ws_wac","Proceedings of the 12th Web as Corpus Workshop","10.63317\u002F2va68regv5ni",{"workshop_id":448,"year":67,"full_workshop_id":449,"proceedings_title":450,"paperCount":319,"doi":451,"pdf_url":63,"venue_ids":448,"publisher":13,"editor":63,"conference_name":72,"conference_acronym":16,"conference_number":73,"conference_location":56,"conference_start_date":74,"conference_end_date":75},"wildre","lrec2020_ws_wildre","Proceedings of the WILDRE5– 5th Workshop on Indian Language Data: Resources and Evaluation","10.63317\u002F2ydivss2veo9",[453,458,463,467,472,478,483,488,494,499,504,509,514,520,525,530,535,540,545,550,554,559,564,569,573,577,582,587,593,598,603,608,613],{"workshop_id":277,"year":49,"full_workshop_id":454,"proceedings_title":455,"paperCount":286,"doi":456,"pdf_url":457,"venue_ids":277,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_bucc","Proceedings of the BUCC Workshop within LREC 2022","10.63317\u002F2mqwgvrp7zkn","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2022\u002Fworkshops\u002FBUCC\u002F2022.bucc-1.0.pdf",{"workshop_id":459,"year":49,"full_workshop_id":460,"proceedings_title":461,"paperCount":332,"doi":462,"pdf_url":63,"venue_ids":459,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"cltw","lrec2022_ws_cltw","Proceedings of the 4th Celtic Language Technology 
Workshop within LREC2022","10.63317\u002F3x8fjtq6m25s",{"workshop_id":303,"year":49,"full_workshop_id":464,"proceedings_title":465,"paperCount":368,"doi":466,"pdf_url":63,"venue_ids":303,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_cmlc","Proceedings of the Workshop on Challenges in the Management of Large Corpora (CMLC-10)","10.63317\u002F2ajestpwy3c8",{"workshop_id":468,"year":49,"full_workshop_id":469,"proceedings_title":470,"paperCount":293,"doi":471,"pdf_url":63,"venue_ids":468,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"csrnlp","lrec2022_ws_csrnlp","Proceedings of the First Computing Social Responsibility Workshop within the 13th Language Resources and Evaluation Conference","10.63317\u002F3xphwxosghv8",{"workshop_id":473,"year":49,"full_workshop_id":474,"proceedings_title":475,"paperCount":476,"doi":477,"pdf_url":63,"venue_ids":473,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"dclrl","lrec2022_ws_dclrl","Proceedings of the Workshop on Dataset Creation for Lower-Resourced Languages within the 13th Language Resources and Evaluation Conference",10,"10.63317\u002F4652bsvzarmy",{"workshop_id":479,"year":49,"full_workshop_id":480,"proceedings_title":481,"paperCount":368,"doi":482,"pdf_url":63,"venue_ids":479,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"digitam","lrec2022_ws_digitam","Proceedings of the Workshop on Processing Language Variation: Digital Armenian (DigitAm) within the 13th Language Resources and Evaluation 
Conference","10.63317\u002F369nz2tcm6qc",{"workshop_id":484,"year":49,"full_workshop_id":485,"proceedings_title":486,"paperCount":332,"doi":487,"pdf_url":63,"venue_ids":484,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"eurali","lrec2022_ws_eurali","Proceedings of the Workshop on Resources and Technologies for Indigenous, Endangered and Lesser-resourced Languages in Eurasia within the 13th Language Resources and Evaluation Conference","10.63317\u002F4dhjcavy7q7y",{"workshop_id":489,"year":49,"full_workshop_id":490,"proceedings_title":491,"paperCount":492,"doi":493,"pdf_url":63,"venue_ids":489,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"fnp","lrec2022_ws_fnp","Proceedings of the 4th Financial Narrative Processing Workshop @LREC2022",24,"10.63317\u002F29xpoafy85p4",{"workshop_id":495,"year":49,"full_workshop_id":496,"proceedings_title":497,"paperCount":380,"doi":498,"pdf_url":63,"venue_ids":495,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"games","lrec2022_ws_games","Proceedings of the 9th Workshop on Games and Natural Language Processing within the 13th Language Resources and Evaluation Conference","10.63317\u002F5om6f5meam4s",{"workshop_id":500,"year":49,"full_workshop_id":501,"proceedings_title":502,"paperCount":257,"doi":503,"pdf_url":63,"venue_ids":500,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"gwll","lrec2022_ws_gwll","Proceedings of Globalex Workshop on Linked Lexicography within the 13th Language Resources and Evaluation 
Conference","10.63317\u002F5knvvemaz9uw",{"workshop_id":336,"year":49,"full_workshop_id":505,"proceedings_title":506,"paperCount":507,"doi":508,"pdf_url":63,"venue_ids":336,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_isa","Proceedings of the 18th Joint ACL - ISO Workshop on Interoperable Semantic Annotation within LREC2022",19,"10.63317\u002F4h3ue6m3sam4",{"workshop_id":510,"year":49,"full_workshop_id":511,"proceedings_title":512,"paperCount":368,"doi":513,"pdf_url":63,"venue_ids":510,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lateraisse","lrec2022_ws_lateraisse","Proceedings of the First Workshop on Language Technology and Resources for a Fair, Inclusive, and Safe Society within the 13th Language Resources and Evaluation Conference","10.63317\u002F5osn5jjjbomp",{"workshop_id":515,"year":49,"full_workshop_id":516,"proceedings_title":517,"paperCount":518,"doi":519,"pdf_url":63,"venue_ids":515,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"law","lrec2022_ws_law","Proceedings of the 16th Linguistic Annotation Workshop (LAW-XVI) within LREC2022",20,"10.63317\u002F3fysdho22dbb",{"workshop_id":521,"year":49,"full_workshop_id":522,"proceedings_title":523,"paperCount":312,"doi":524,"pdf_url":63,"venue_ids":521,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"legal","lrec2022_ws_legal","Proceedings of the Workshop on Ethical and Legal Issues in Human Language Technologies and Multilingual De-Identification of Sensitive Data In Language Resources within the 
13th Language Resources and Evaluation Conference","10.63317\u002F273whfjsjapd",{"workshop_id":371,"year":49,"full_workshop_id":526,"proceedings_title":527,"paperCount":528,"doi":529,"pdf_url":63,"venue_ids":371,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_lt4hala","Proceedings of the Second Workshop on Language Technologies for Historical and Ancient Languages",31,"10.63317\u002F3dte53mz4zvu",{"workshop_id":531,"year":49,"full_workshop_id":532,"proceedings_title":533,"paperCount":345,"doi":534,"pdf_url":63,"venue_ids":531,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"mwe","lrec2022_ws_mwe","Proceedings of the 18th Workshop on Multiword Expressions @LREC2022","10.63317\u002F2fftdmypb747",{"workshop_id":536,"year":49,"full_workshop_id":537,"proceedings_title":538,"paperCount":286,"doi":539,"pdf_url":63,"venue_ids":536,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"nidcp","lrec2022_ws_nidcp","Proceedings of the 2nd Workshop on Novel Incentives in Data Collection from People: models, implementations, challenges and results within LREC 2022","10.63317\u002F2dox4kgfq3mg",{"workshop_id":541,"year":49,"full_workshop_id":542,"proceedings_title":543,"paperCount":312,"doi":544,"pdf_url":63,"venue_ids":541,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"nlperspectives","lrec2022_ws_nlperspectives","Proceedings of the 1st Workshop on Perspectivist Approaches to NLP 
@LREC2022","10.63317\u002F5nzs42fwjimz",{"workshop_id":393,"year":49,"full_workshop_id":546,"proceedings_title":547,"paperCount":548,"doi":549,"pdf_url":63,"venue_ids":393,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_osact","Proceedings of the 5th Workshop on Open-Source Arabic Corpora and Processing Tools with Shared Tasks on Qur'an QA and Fine-Grained Hate Speech Detection",28,"10.63317\u002F4u4quhegagc5",{"workshop_id":398,"year":49,"full_workshop_id":551,"proceedings_title":552,"paperCount":507,"doi":553,"pdf_url":63,"venue_ids":398,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_parlaclarin","Proceedings of the Workshop ParlaCLARIN III within the 13th Language Resources and Evaluation Conference","10.63317\u002F4zzb69hz9ebb",{"workshop_id":555,"year":49,"full_workshop_id":556,"proceedings_title":557,"paperCount":411,"doi":558,"pdf_url":63,"venue_ids":555,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"politicalnlp","lrec2022_ws_politicalnlp","Proceedings of the LREC 2022 workshop on Natural Language Processing for Political Sciences","10.63317\u002F5h778npybpti",{"workshop_id":560,"year":49,"full_workshop_id":561,"proceedings_title":562,"paperCount":368,"doi":563,"pdf_url":63,"venue_ids":560,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"pvlam","lrec2022_ws_pvlam","Proceedings of the 2nd Workshop on People in Vision, Language, and the 
Mind","10.63317\u002F52rissdzp475",{"workshop_id":565,"year":49,"full_workshop_id":566,"proceedings_title":567,"paperCount":319,"doi":568,"pdf_url":63,"venue_ids":565,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"rapid","lrec2022_ws_rapid","Proceedings of the RaPID Workshop - Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive\u002Fpsychiatric\u002Fdevelopmental impairments - within the 13th Language Resources and Evaluation Conference","10.63317\u002F4jch25mm92pa",{"workshop_id":408,"year":49,"full_workshop_id":570,"proceedings_title":571,"paperCount":286,"doi":572,"pdf_url":63,"venue_ids":408,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_readi","Proceedings of the 2nd Workshop on Tools and Resources to Empower People with REAding DIfficulties (READI) within the 13th Language Resources and Evaluation Conference","10.63317\u002F56pm6ipunttk",{"workshop_id":414,"year":49,"full_workshop_id":574,"proceedings_title":575,"paperCount":417,"doi":576,"pdf_url":63,"venue_ids":414,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_restup","Proceedings of the Second International Workshop on Resources and Techniques for User Information in Abusive Language 
Analysis","10.63317\u002F53due7cg7w44",{"workshop_id":578,"year":49,"full_workshop_id":579,"proceedings_title":580,"paperCount":368,"doi":581,"pdf_url":63,"venue_ids":578,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"salld","lrec2022_ws_salld","Proceedings of the 2nd Workshop on Sentiment Analysis and Linguistic Linked Data","10.63317\u002F2ueor4yvpz4s",{"workshop_id":420,"year":49,"full_workshop_id":583,"proceedings_title":584,"paperCount":585,"doi":586,"pdf_url":63,"venue_ids":420,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_signlang","Proceedings of the LREC2022 10th Workshop on the Representation and Processing of Sign Languages: Multilingual Sign Language Resources",32,"10.63317\u002F2rifm6bf4efz",{"workshop_id":588,"year":49,"full_workshop_id":589,"proceedings_title":590,"paperCount":591,"doi":592,"pdf_url":63,"venue_ids":588,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"sigul","lrec2022_ws_sigul","Proceedings of the 1st Annual Meeting of the ELRA\u002FISCA Special Interest Group on Under-Resourced Languages",27,"10.63317\u002F5nb3qu29q9zi",{"workshop_id":594,"year":49,"full_workshop_id":595,"proceedings_title":596,"paperCount":507,"doi":597,"pdf_url":63,"venue_ids":594,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"sltat","lrec2022_ws_sltat","Proceedings of the 7th International Workshop on Sign Language Translation and Avatar Technology: The Junction of the Visual and the Textual: Challenges and 
Perspectives","10.63317\u002F3xfoevzar6ig",{"workshop_id":599,"year":49,"full_workshop_id":600,"proceedings_title":601,"paperCount":476,"doi":602,"pdf_url":63,"venue_ids":599,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"smila","lrec2022_ws_smila","Proceedings of the Workshop on Smiling and Laughter across Contexts and the Life-span within the 13th Language Resources and Evaluation Conference","10.63317\u002F47g2oou8nqdu",{"workshop_id":604,"year":49,"full_workshop_id":605,"proceedings_title":606,"paperCount":368,"doi":607,"pdf_url":63,"venue_ids":604,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"tdle","lrec2022_ws_tdle","Proceedings of the Workshop Towards Digital Language Equality within the 13th Language Resources and Evaluation Conference","10.63317\u002F3cx3opcocn9i",{"workshop_id":609,"year":49,"full_workshop_id":610,"proceedings_title":611,"paperCount":380,"doi":612,"pdf_url":63,"venue_ids":609,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"term","lrec2022_ws_term","Proceedings of the Workshop on Terminology in the 21st century: many faces, many places","10.63317\u002F23pdrqa3onr3",{"workshop_id":448,"year":49,"full_workshop_id":614,"proceedings_title":615,"paperCount":345,"doi":616,"pdf_url":63,"venue_ids":448,"publisher":13,"editor":63,"conference_name":54,"conference_acronym":16,"conference_number":55,"conference_location":56,"conference_start_date":59,"conference_end_date":60},"lrec2022_ws_wildre","Proceedings of the WILDRE-6 Workshop within the 13th Language Resources and Evaluation 
Conference","10.63317\u002F34agbocrmxe4",[618,623,630,638,644,649,656,663,670,677,683,690,696,703,711,717,723,729,735,742,748,755,762,768,774,780,786,792,798,805,812,818,825,831,837,843],{"workshop_id":277,"year":29,"full_workshop_id":619,"proceedings_title":620,"paperCount":312,"doi":621,"pdf_url":622,"venue_ids":277,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_bucc","Proceedings of the 17th Workshop on Building and Using Comparable Corpora (BUCC) @ LREC-COLING 2024","10.63317\u002F3tk8bqt3knqn","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fbucc\u002F2024.bucc-1.0.pdf",{"workshop_id":624,"year":29,"full_workshop_id":625,"proceedings_title":626,"paperCount":293,"doi":627,"pdf_url":628,"venue_ids":629,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"cawl","lrec2024_ws_cawl","Proceedings of the Second Workshop on Computation and Written Language (CAWL) @ LREC-COLING 2024","10.63317\u002F5jv5da4ct2px","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fcawl\u002F2024.cawl-1.0.pdf","cawl|ws",{"workshop_id":631,"year":29,"full_workshop_id":632,"proceedings_title":633,"paperCount":634,"doi":635,"pdf_url":636,"venue_ids":637,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"cl4health","lrec2024_ws_cl4health","Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 
2024",33,"10.63317\u002F3keuurbv54de","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fpolp\u002F2024.cl4health-1.0.pdf","cl4health|ws",{"workshop_id":639,"year":29,"full_workshop_id":640,"proceedings_title":641,"paperCount":507,"doi":642,"pdf_url":643,"venue_ids":639,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"cogalex","lrec2024_ws_cogalex","Proceedings of the Workshop on Cognitive Aspects of the Lexicon @ LREC-COLING 2024","10.63317\u002F2gq7359pqznx","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fdelite\u002F2024.delite-1.0.pdf",{"workshop_id":645,"year":29,"full_workshop_id":646,"proceedings_title":647,"paperCount":380,"doi":648,"pdf_url":643,"venue_ids":645,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"delite","lrec2024_ws_delite","Proceedings of the First Workshop on Language-driven Deliberation Technology (DELITE) @ LREC-COLING 2024","10.63317\u002F3pcpupr4j9wb",{"workshop_id":650,"year":29,"full_workshop_id":651,"proceedings_title":652,"paperCount":332,"doi":653,"pdf_url":654,"venue_ids":655,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"determit","lrec2024_ws_determit","Proceedings of the Workshop on DeTermIt! 
Evaluating Text Difficulty in a Multilingual Context @ LREC-COLING 2024","10.63317\u002F32qtrrr46eau","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fdetermit\u002F2024.determit-1.0.pdf","determit|ws",{"workshop_id":657,"year":29,"full_workshop_id":658,"proceedings_title":659,"paperCount":293,"doi":660,"pdf_url":661,"venue_ids":662,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"dlnld","lrec2024_ws_dlnld","Proceedings of the Workshop on Deep Learning and Linked Data (DLnLD) @ LREC-COLING 2024","10.63317\u002F543pjjgkbst9","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fdlnld\u002F2024.dlnld-1.0.pdf","dlnld|ws",{"workshop_id":664,"year":29,"full_workshop_id":665,"proceedings_title":666,"paperCount":345,"doi":667,"pdf_url":668,"venue_ids":669,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"dmr","lrec2024_ws_dmr","Proceedings of the Fifth International Workshop on Designing Meaning Representations @ LREC-COLING 2024","10.63317\u002F5q4wbidauaxn","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fdmr\u002F2024.dmr-1.0.pdf","dmr|ws",{"workshop_id":671,"year":29,"full_workshop_id":672,"proceedings_title":673,"paperCount":312,"doi":674,"pdf_url":675,"venue_ids":676,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"ecnlp","lrec2024_ws_ecnlp","Proceedings of the Seventh Workshop on e-Commerce and NLP @ LREC-COLING 
2024","10.63317\u002F4upp3i6m57nt","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fecnlp\u002F2024.ecnlp-1.0.pdf","ecnlp|ws",{"workshop_id":484,"year":29,"full_workshop_id":678,"proceedings_title":679,"paperCount":293,"doi":680,"pdf_url":681,"venue_ids":682,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_eurali","Proceedings of the 2nd Workshop on Resources and Technologies for Indigenous, Endangered and Lesser-resourced Languages in Eurasia (EURALI) @ LREC-COLING 2024","10.63317\u002F3z633pd4tyg2","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Feurali\u002F2024.eurali-1.0.pdf","eurali|ws",{"workshop_id":684,"year":29,"full_workshop_id":685,"proceedings_title":686,"paperCount":687,"doi":688,"pdf_url":689,"venue_ids":684,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"finnlp","lrec2024_ws_finnlp","Proceedings of the Joint Workshop of the 7th Financial Technology and Natural Language Processing, the 5th Knowledge Discovery from Unstructured Data in Financial Services, and the 4th Workshop on Economics and Natural Language Processing",34,"10.63317\u002F46uvxxoj8prq","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Ffinnlp\u002F2024.finnlp-1.0.pdf",{"workshop_id":495,"year":29,"full_workshop_id":691,"proceedings_title":692,"paperCount":319,"doi":693,"pdf_url":694,"venue_ids":695,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_games","Proceedings of the 10th Workshop on Games and Natural Language Processing @ LREC-COLING 
2024","10.63317\u002F4d46836qy76p","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fgames\u002F2024.games-1.0.pdf","games|ws",{"workshop_id":697,"year":29,"full_workshop_id":698,"proceedings_title":699,"paperCount":286,"doi":700,"pdf_url":701,"venue_ids":702,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"htres","lrec2024_ws_htres","Proceedings of the First Workshop on Holocaust Testimonies as Language Resources (HTRes) @ LREC-COLING 2024","10.63317\u002F47iakwwytvs8","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fhtres\u002F2024.htres-1.0.pdf","htres|ws",{"workshop_id":704,"year":29,"full_workshop_id":705,"proceedings_title":706,"paperCount":707,"doi":708,"pdf_url":709,"venue_ids":710,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"humeval","lrec2024_ws_humeval","Proceedings of the Fourth Workshop on Human Evaluation of NLP Systems (HumEval) @ LREC-COLING 2024",26,"10.63317\u002F3jfrug2yvkgc","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fhumeval\u002F2024.humeval-1.0.pdf","humeval|ws",{"workshop_id":336,"year":29,"full_workshop_id":712,"proceedings_title":713,"paperCount":332,"doi":714,"pdf_url":715,"venue_ids":716,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_isa","Proceedings of the 20th Joint ACL - ISO Workshop on Interoperable Semantic Annotation @ LREC-COLING 
2024","10.63317\u002F5g5ddg8i3y47","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fisa\u002F2024.isa-1.0.pdf","isa|ws",{"workshop_id":349,"year":29,"full_workshop_id":718,"proceedings_title":719,"paperCount":312,"doi":720,"pdf_url":721,"venue_ids":722,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_ldl","Proceedings of the 9th Workshop on Linked Data in Linguistics @ LREC-COLING 2024","10.63317\u002F4gz96nfw2gdk","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fldl\u002F2024.ldl-1.0.pdf","ldl|ws",{"workshop_id":521,"year":29,"full_workshop_id":724,"proceedings_title":725,"paperCount":266,"doi":726,"pdf_url":727,"venue_ids":728,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_legal","Proceedings of the Workshop on Legal and Ethical Issues in Human Language Technologies @ LREC-COLING 2024","10.63317\u002F2wkziwv5fb97","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Flegal\u002F2024.legal-1.0.pdf","legal|ws",{"workshop_id":371,"year":29,"full_workshop_id":730,"proceedings_title":731,"paperCount":634,"doi":732,"pdf_url":733,"venue_ids":734,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_lt4hala","Proceedings of the Third Workshop on Language Technologies for Historical and Ancient Languages (LT4HALA) @ 
LREC-COLING-2024","10.63317\u002F2vavxjcscp8z","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Flt4hala\u002F2024.lt4hala-1.0.pdf","lt4hala|ws",{"workshop_id":736,"year":29,"full_workshop_id":737,"proceedings_title":738,"paperCount":273,"doi":739,"pdf_url":740,"venue_ids":741,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"mathnlp","lrec2024_ws_mathnlp","Proceedings of the 2nd Workshop on Mathematical Natural Language Processing @ LREC-COLING 2024","10.63317\u002F2ydwrzo67zpj","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fmathnlp\u002F2024.mathnlp-1.0.pdf","mathnlp|ws",{"workshop_id":531,"year":29,"full_workshop_id":743,"proceedings_title":744,"paperCount":591,"doi":745,"pdf_url":746,"venue_ids":747,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_mwe","Proceedings of the Joint Workshop on Multiword Expressions and Universal Dependencies (MWE-UD) @ LREC-COLING 2024","10.63317\u002F42csaq87z39r","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fmwe\u002F2024.mweud-1.0.pdf","mwe|udw|ws",{"workshop_id":749,"year":29,"full_workshop_id":750,"proceedings_title":751,"paperCount":273,"doi":752,"pdf_url":753,"venue_ids":754,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"neusymbridge","lrec2024_ws_neusymbridge","Proceedings of the Workshop: Bridging Neurons and Symbols for Natural Language Processing and Knowledge Graphs Reasoning (NeusymBridge) @ 
LREC-COLING-2024","10.63317\u002F2vsheftp3ti9","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fneusymbridge\u002F2024.neusymbridge-1.0.pdf","neusymbridge|ws",{"workshop_id":541,"year":29,"full_workshop_id":756,"proceedings_title":757,"paperCount":758,"doi":759,"pdf_url":760,"venue_ids":761,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_nlperspectives","Proceedings of the 3rd Workshop on Perspectivist Approaches to NLP (NLPerspectives) @ LREC-COLING 2024",16,"10.63317\u002F2cojnfknheph","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fnlperspectives\u002F2024.nlperspectives-1.0.pdf","nlperspectives|ws",{"workshop_id":393,"year":29,"full_workshop_id":763,"proceedings_title":764,"paperCount":345,"doi":765,"pdf_url":766,"venue_ids":767,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_osact","Proceedings of the 6th Workshop on Open-Source Arabic Corpora and Processing Tools (OSACT) with Shared Tasks on Arabic LLMs Hallucination and Dialect to MSA Machine Translation @ LREC-COLING 2024","10.63317\u002F5d5qxytkajay","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fosact\u002F2024.osact-1.0.pdf","osact|ws",{"workshop_id":398,"year":29,"full_workshop_id":769,"proceedings_title":770,"paperCount":440,"doi":771,"pdf_url":772,"venue_ids":773,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_parlaclarin","Proceedings of the IV Workshop on Creating, Analysing, and Increasing Accessibility of Parliamentary Corpora (ParlaCLARIN) @ LREC-COLING 
2024","10.63317\u002F46c8xka7m8f7","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fparlaclarin\u002F2024.parlaclarin-1.0.pdf","parlaclarin|ws",{"workshop_id":555,"year":29,"full_workshop_id":775,"proceedings_title":776,"paperCount":476,"doi":777,"pdf_url":778,"venue_ids":779,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_politicalnlp","Proceedings of the Second Workshop on Natural Language Processing for Political Sciences @ LREC-COLING 2024","10.63317\u002F3qf3r8pwtkvp","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fpoliticalnlp\u002F2024.politicalnlp-1.0.pdf","politicalnlp|ws",{"workshop_id":403,"year":29,"full_workshop_id":781,"proceedings_title":782,"paperCount":345,"doi":783,"pdf_url":784,"venue_ids":785,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_rail","Proceedings of the Fifth Workshop on Resources for African Indigenous Languages @ LREC-COLING 2024","10.63317\u002F2iyqymd34fup","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Frail\u002F2024.rail-1.0.pdf","rail|ws",{"workshop_id":565,"year":29,"full_workshop_id":787,"proceedings_title":788,"paperCount":266,"doi":789,"pdf_url":790,"venue_ids":791,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_rapid","Proceedings of the Fifth Workshop on Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive\u002Fpsychiatric\u002Fdevelopmental impairments @LREC-COLING 
2024","10.63317\u002F5pc4wtot6r3x","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Frapid\u002F2024.rapid-1.0.pdf","rapid|ws",{"workshop_id":408,"year":29,"full_workshop_id":793,"proceedings_title":794,"paperCount":286,"doi":795,"pdf_url":796,"venue_ids":797,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_readi","Proceedings of the 3rd Workshop on Tools and Resources for People with REAding DIfficulties (READI) @ LREC-COLING 2024","10.63317\u002F4b546asxrjr6","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Freadi\u002F2024.readi-1.0.pdf","readi|ws",{"workshop_id":799,"year":29,"full_workshop_id":800,"proceedings_title":801,"paperCount":273,"doi":802,"pdf_url":803,"venue_ids":804,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"rfp","lrec2024_ws_rfp","Proceedings of the First Workshop on Reference, Framing, and Perspective @ LREC-COLING 2024","10.63317\u002F4xwx3twp9qoy","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Frfp\u002F2024.rfp-1.0.pdf","rfp|ws",{"workshop_id":806,"year":29,"full_workshop_id":807,"proceedings_title":808,"paperCount":273,"doi":809,"pdf_url":810,"venue_ids":811,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"safety4convai","lrec2024_ws_safety4convai","Proceedings of Safety4ConvAI: The Third Workshop on Safety for Conversational AI @ LREC-COLING 
2024","10.63317\u002F4johe7jpagg6","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fsafeai\u002F2024.safety4convai-1.0.pdf","safety4convai|ws",{"workshop_id":420,"year":29,"full_workshop_id":813,"proceedings_title":814,"paperCount":815,"doi":816,"pdf_url":817,"venue_ids":420,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_signlang","Proceedings of the LREC-COLING 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources",45,"10.63317\u002F4e7aayu2htd6","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fsignlang\u002F2024.signlang-1.0.pdf",{"workshop_id":588,"year":29,"full_workshop_id":819,"proceedings_title":820,"paperCount":821,"doi":822,"pdf_url":823,"venue_ids":824,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_sigul","Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024",50,"10.63317\u002F55wjiy53vy99","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fsigul\u002F2024.sigul-1.0.pdf","sigul|ws",{"workshop_id":604,"year":29,"full_workshop_id":826,"proceedings_title":827,"paperCount":368,"doi":828,"pdf_url":829,"venue_ids":830,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_tdle","Proceedings of the Second International Workshop Towards Digital Language Equality (TDLE): Focusing on Sustainability @ LREC-COLING 
2024","10.63317\u002F3p5nrhhwdhbe","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Ftdle\u002F2024.tdle-1.0.pdf","tdle|ws",{"workshop_id":437,"year":29,"full_workshop_id":832,"proceedings_title":833,"paperCount":345,"doi":834,"pdf_url":835,"venue_ids":836,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_trac","Proceedings of the Fourth Workshop on Threat, Aggression & Cyberbullying @ LREC-COLING-2024","10.63317\u002F2ev2ox49nijy","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Ftrac\u002F2024.trac-1.0.pdf","trac|ws",{"workshop_id":838,"year":29,"full_workshop_id":839,"proceedings_title":840,"paperCount":758,"doi":841,"pdf_url":842,"venue_ids":838,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"unlp","lrec2024_ws_unlp","Proceedings of the Third Ukrainian Natural Language Processing Workshop (UNLP) @ LREC-COLING 2024","10.63317\u002F5bwu58575ghh","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Funlp\u002F2024.unlp-1.0.pdf",{"workshop_id":448,"year":29,"full_workshop_id":844,"proceedings_title":845,"paperCount":266,"doi":846,"pdf_url":847,"venue_ids":848,"publisher":34,"editor":63,"conference_name":36,"conference_acronym":37,"conference_number":38,"conference_location":39,"conference_start_date":42,"conference_end_date":43},"lrec2024_ws_wildre","Proceedings of the 7th Workshop on Indian Language Data: Resources and 
Evaluation","10.63317\u002F52j5bum2j3fk","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec-coling-2024\u002Fwildre\u002F2024.wildre-1.0.pdf","wildre|ws",{"conference_id":80,"year":81,"proceedings_title":82,"venue_ids":9,"isbn":83,"issn":11,"doi":84,"publisher":13,"editors":85,"conference_name":86,"conference_acronym":16,"conference_number":87,"conference_location":88,"conference_city":89,"conference_country":90,"conference_start_date":91,"conference_end_date":92,"conference_url":93,"pdf_url":94,"img_conf_url":63,"paperCount":95},{"conference_id":80,"year":81,"proceedings_title":82,"venue_ids":9,"isbn":83,"issn":11,"doi":84,"publisher":13,"editors":85,"conference_name":86,"conference_acronym":16,"conference_number":87,"conference_location":88,"conference_city":89,"conference_country":90,"conference_start_date":91,"conference_end_date":92,"conference_url":93,"pdf_url":94,"img_conf_url":63,"paperCount":95},[852,872,895,910,931,952,967,981,1004,1025,1040,1061,1091,1112,1139,1175,1190,1235,1254,1287,1305,1323,1335,1350,1373,1412,1427,1439,1454,1480,1493,1514,1529,1543,1557,1575,1593,1629,1647,1667,1691,1712,1743,1762,1788,1803,1816,1828,1849,1867,1885,1900,1915,1930,1944,1962,1975,1993,2005,2017,2035,2056,2071,2099,2121,2138,2160,2175,2190,2226,2244,2259,2274,2320,2338,2359,2371,2386,2415,2440,2454,2477,2492,2507,2521,2545,2624,2638,2659,2675,2687,2705,2724,2744,2760,2778,2790,2807,2822,2837,2852,2867,2885,2921,2937,2955,2966,2987,3002,3019,3033,3048,3065,3080,3090,3106,3123,3137,3160,3175,3215,3229,3241,3257,3274,3291,3305,3317,3331,3348,3361,3388,3402,3417,3434,3446,3464,3476,3493,3513,3527,3548,3569,3585,3601,3621,3634,3654,3672,3693,3707,3725,3740,3760,3779,3790,3806,3821,3840,3858,3875,3897,3918,3934,3954,3971,3987,4000,4019,4041,4056,4069,4081,4096,4113,4128,4140,4155,4177,4198,4213,4223,4243,4269,4287,4300,4313,4326,4347,4363,4381,4394,4411,4432,4457,4469,4487,4498,4521,4534,4553,4567,4588,4605,4623,4641,4676,4696,4720,4750,4768,4794,4806,4830,4854,48
72,4899,4914,4925,4943,4958,4975,4990,5002,5018,5036,5056,5069,5087,5104,5117,5140,5158,5176,5196,5222,5238,5254,5276,5288,5303,5318,5333,5348,5363,5383,5401,5418,5432,5448,5477,5502,5523,5538,5556,5587,5605,5619,5643,5656,5676,5700,5719,5733,5756,5773,5794,5813,5829,5843,5854,5870,5886,5902,5923,5940,5963,5987,6000,6024,6037,6051,6064,6080,6098,6117,6133,6163,6176,6192,6211,6229,6241,6280,6300,6310,6325,6341,6360,6399,6413,6429,6448,6463,6481,6497,6511,6530,6549,6563,6580,6593,6605,6620,6637,6655,6681,6698,6713,6734,6750,6762,6780,6794,6808,6820,6837,6850,6865,6888,6900,6938,6948,6961,6979,6992,7010,7022,7043,7063,7077,7088,7100,7119,7130,7145,7159,7177,7193,7208,7224,7237,7252,7265,7282,7306,7326,7341,7353,7371,7394,7418,7435,7450,7461,7473,7492,7510,7532,7551,7576,7590,7612,7626,7654,7679,7693,7704,7727,7745,7763,7779,7790,7804,7820,7835,7858,7878,7890,7910,7921,7936,7957,7974,7992,8013,8027,8050,8073,8086,8102,8115,8132,8145,8160,8175,8190,8203,8232,8249,8269,8290,8303,8316,8328,8351,8365,8393,8405,8420,8433,8447,8467,8483,8501,8523,8544,8561,8579,8596,8619,8639,8653,8672,8695,8716,8731,8748,8772,8787,8802,8816,8833,8851,8867,8887,8899,8917,8930,8942,8963,8977,8989,9018,9030,9051,9077,9099,9113,9129,9143,9154,9171,9196,9210,9228,9240,9267,9286,9305,9321,9334,9349,9362,9375,9387,9399,9415,9437,9452,9470,9491,9504,9520,9534,9549,9564,9580,9597,9615,9629,9649,9667,9683,9702,9723,9738,9756,9769,9785,9797,9812,9827,9845,9865,9881,9899,9915,9930,9945,9956,9973,9990,10005,10020,10042,10060,10078,10094,10107,10125,10143,10158,10179,10196,10226,10240,10259,10287,10307,10329,10367,10383,10405,10417,10444,10458,10471,10482,10495,10508,10521,10533,10550,10578,10592,10608,10621,10637,10655,10670,10690,10703,10721,10740,10754,10771,10790,10809,10828,10841,10857,10870,10882,10896,10915,10930,10946,10960,10973,10991,11004,11021,11037,11073,11091,11111,11127,11137,11156,11168,11179,11195,11209,11225,11239,11254,11268,11279,11295,11317,11338,11353,11371,11390,11405,11420,11439,11
460,11476,11493,11510,11523,11539,11553,11567,11585,11602,11628,11641,11666,11691,11702,11713,11727,11743,11761,11791,11805,11848,11863,11877,11896,11910,11926,11938,11966,11987,12010,12022,12039,12052,12076,12106,12117,12135,12152,12169,12184,12200,12221,12232,12245,12260,12277,12290,12306,12318,12335,12348,12360,12375,12389,12405,12416,12434,12451,12465,12480,12495,12510,12535,12548,12570,12585,12601,12637,12652,12668,12682,12697,12711,12740,12790,12811,12831,12847,12868,12879,12897,12911,12929,12940,12960,12975,12987,13005,13019,13035,13051,13072,13089,13106,13121,13136,13146,13161,13174,13190,13203,13229,13243,13257,13280,13295,13308,13326,13343,13357,13370,13393,13404,13427,13441,13470,13484,13500,13515,13530,13550,13564,13580,13594,13609,13622,13634,13648,13660],{"paper_id":853,"title":854,"year":81,"month":855,"day":63,"doi":856,"resource_url":857,"first_page":63,"last_page":63,"pdf_url":858,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":859,"paper_type":860,"authors":861,"abstract":871},"lrec2018-main-001","Augmenting Librispeech with French Translations: A Multimodal Corpus for Direct Speech Translation Evaluation","05","10.63317\u002F4tgpdx9xax4j","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-001","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F621.pdf","kocabiyikoglu-etal-2018-augmenting","main",[862,865,868],{"paper_id":853,"author_seq":247,"given_name":863,"surname":864,"affiliation":63,"orcid":63},"Ali Can","Kocabiyikoglu",{"paper_id":853,"author_seq":232,"given_name":866,"surname":867,"affiliation":63,"orcid":63},"Laurent","Besacier",{"paper_id":853,"author_seq":218,"given_name":869,"surname":870,"affiliation":63,"orcid":63},"Olivier","Kraif","Recent works in spoken language translation (SLT) have attempted to build end-to-end speech-to-text translation without using source language transcription during learning or decoding. 
However, while large quantities of parallel texts (such as Europarl, OpenSubtitles) are available for training machine translation systems, there are no large (> 100h) and open source parallel corpora that include speech in a source language aligned to text in a target language.  This paper tries to fill this gap by augmenting an existing (monolingual) corpus: LibriSpeech. This corpus, used for automatic speech recognition,  is derived from read audiobooks from the LibriVox project, and has been carefully segmented and aligned. After gathering French e-books corresponding to the English audio-books from LibriSpeech,  we align speech segments at the sentence level with their respective translations and obtain 236h of usable parallel data. This paper presents the details of the processing as well as a manual evaluation conducted on a small subset of the corpus. This evaluation shows that the automatic alignment scores are reasonably correlated with the human judgments of the bilingual alignment quality. 
We believe that this corpus (which is made available online) is useful for replicable experiments in direct speech translation or more general spoken language translation experiments.",{"paper_id":873,"title":874,"year":81,"month":855,"day":63,"doi":875,"resource_url":876,"first_page":63,"last_page":63,"pdf_url":877,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":878,"paper_type":860,"authors":879,"abstract":894},"lrec2018-main-002","Evaluating Domain Adaptation for Machine Translation Across Scenarios","10.63317\u002F4dcgzoc6ozms","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-002","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F568.pdf","etchegoyhen-etal-2018-evaluating",[880,883,886,889,892],{"paper_id":873,"author_seq":247,"given_name":881,"surname":882,"affiliation":63,"orcid":63},"Thierry","Etchegoyhen",{"paper_id":873,"author_seq":232,"given_name":884,"surname":885,"affiliation":63,"orcid":63},"Anna","Fernández Torné",{"paper_id":873,"author_seq":218,"given_name":887,"surname":888,"affiliation":63,"orcid":63},"Andoni","Azpeitia",{"paper_id":873,"author_seq":203,"given_name":890,"surname":891,"affiliation":63,"orcid":63},"Eva","Martínez Garcia",{"paper_id":873,"author_seq":188,"given_name":884,"surname":893,"affiliation":63,"orcid":63},"Matamala","We present an evaluation of the benefits of domain adaptation for machine translation, on three separate domains and language pairs, with varying degrees of domain specificity and amounts of available training data. Domain-adapted statistical and neural machine translation systems are compared to each other and to generic online systems, thus providing an evaluation of the main options in terms of machine translation. Alongside automated translation metrics, we present experimental results involving professional translators, in terms of quality assessment, subjective evaluations of the task and post-editing productivity measurements. 
The results we present quantify the clear advantages of domain adaptation for machine translation, with marked impacts for domains with higher specificity. Additionally, the results of the experiments show domain-adapted neural machine translation systems to be the optimal choice overall.",{"paper_id":896,"title":897,"year":81,"month":855,"day":63,"doi":898,"resource_url":899,"first_page":63,"last_page":63,"pdf_url":900,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":901,"paper_type":860,"authors":902,"abstract":909},"lrec2018-main-003","Upping the Ante: Towards a Better Benchmark for Chinese-to-English Machine Translation","10.63317\u002F4mvje9bfx9j2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-003","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F678.pdf","hadiwinoto-ng-2018-upping",[903,906],{"paper_id":896,"author_seq":247,"given_name":904,"surname":905,"affiliation":63,"orcid":63},"Christian","Hadiwinoto",{"paper_id":896,"author_seq":232,"given_name":907,"surname":908,"affiliation":63,"orcid":63},"Hwee Tou","Ng","There are many machine translation (MT) papers that propose novel approaches and show improvements over their self-defined baselines. The experimental setting in each paper often differs from one another. As such, it is hard to determine if a proposed approach is really useful and advances the state of the art. Chinese-to-English translation is a common translation direction in MT papers, although there is not one widely accepted experimental setting in Chinese-to-English MT. Our goal in this paper is to propose a benchmark in evaluation setup for Chinese-to-English machine translation, such that the effectiveness of a new proposed MT approach can be directly compared to previous approaches. Towards this end, we also built a highly competitive state-of-the-art MT system trained on a large-scale training set. 
Our system outperforms reported results on NIST OpenMT test sets in almost all papers published in major conferences and journals in computational linguistics and artificial intelligence in the past 11 years. We argue that a standardized benchmark on data and performance is important for meaningful comparison.",{"paper_id":911,"title":912,"year":81,"month":855,"day":63,"doi":913,"resource_url":914,"first_page":63,"last_page":63,"pdf_url":915,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":916,"paper_type":860,"authors":917,"abstract":930},"lrec2018-main-004","ESCAPE: a Large-scale Synthetic Corpus for Automatic Post-Editing","10.63317\u002F5muiiadpkbwz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-004","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F282.pdf","negri-etal-2018-escape",[918,921,924,927],{"paper_id":911,"author_seq":247,"given_name":919,"surname":920,"affiliation":63,"orcid":63},"Matteo","Negri",{"paper_id":911,"author_seq":232,"given_name":922,"surname":923,"affiliation":63,"orcid":63},"Marco","Turchi",{"paper_id":911,"author_seq":218,"given_name":925,"surname":926,"affiliation":63,"orcid":63},"Rajen","Chatterjee",{"paper_id":911,"author_seq":203,"given_name":928,"surname":929,"affiliation":63,"orcid":63},"Nicola","Bertoldi","Training models for the automatic correction of machine-translated text usually relies on data consisting of (source,MT, human post-edit) triplets providing, for each source sentence, examples of translation errors with the corresponding corrections made by a human post-editor. Ideally, a large amount of data of this kind should allow the model to learn reliable correction patterns and effectively apply them at test stage on unseen (source, MT) pairs. In practice, however, their limited availability calls for solutions that also integrate in the training process other sources of knowledge. 
Along this direction, state-of-the-art results have been recently achieved by systems that, in addition to a limited amount of available training data, exploit artificial corpora that approximate elements of the “gold” training instances with automatic translations. Following this idea, we present eSCAPE, the largest freely-available Synthetic Corpus for Automatic Post-Editing released so far. eSCAPE consists of millions of entries in which the MT element of the training triplets has been obtained by translating the source side of publicly-available parallel corpora, and using the target side as an artificial human post-edit. Translations are obtained both with phrase-based and neural models. For each MT paradigm, eSCAPE contains 7.2 million triplets for English–German and 3.3 million for English–Italian, resulting in a total of 14.4 and 6.6 million instances respectively. The usefulness of eSCAPE is proved through experiments in a general-domain scenario, the most challenging one for automatic post-editing. 
For both language directions, the models trained on our artificial data always improve MT quality with statistically significant gains.",{"paper_id":932,"title":933,"year":81,"month":855,"day":63,"doi":934,"resource_url":935,"first_page":63,"last_page":63,"pdf_url":936,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":937,"paper_type":860,"authors":938,"abstract":951},"lrec2018-main-005","Evaluating Machine Translation Performance on Chinese Idioms with a Blacklist Method","10.63317\u002F53akuhvjzmxb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-005","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F381.pdf","shao-etal-2018-evaluating",[939,942,945,948],{"paper_id":932,"author_seq":247,"given_name":940,"surname":941,"affiliation":63,"orcid":63},"Yutong","Shao",{"paper_id":932,"author_seq":232,"given_name":943,"surname":944,"affiliation":63,"orcid":63},"Rico","Sennrich",{"paper_id":932,"author_seq":218,"given_name":946,"surname":947,"affiliation":63,"orcid":63},"Bonnie","Webber",{"paper_id":932,"author_seq":203,"given_name":949,"surname":950,"affiliation":63,"orcid":63},"Federico","Fancellu","Idiom translation is a challenging problem in machine translation because the meaning of idioms is non-compositional, and a literal (word-by-word) translation is likely to be wrong. In this paper, we focus on evaluating the quality of idiom translation of MT systems. We introduce a new evaluation method based on an idiom-specific blacklist of literal translations, based on the insight that the occurrence of any blacklisted words in the translation output indicates a likely translation error. We introduce a dataset, CIBB (Chinese Idioms Blacklists Bank), and perform an evaluation of a state-of-the-art Chinese→English neural MT system. 
Our evaluation confirms that a sizable number of idioms in our test set are mistranslated (46.1%), that literal translation error is a common error type, and that our blacklist method is effective at identifying literal translation errors.",{"paper_id":953,"title":954,"year":81,"month":855,"day":63,"doi":955,"resource_url":956,"first_page":63,"last_page":63,"pdf_url":957,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":958,"paper_type":860,"authors":959,"abstract":966},"lrec2018-main-006","Network Features Based Co-hyponymy Detection","10.63317\u002F2ozwzkvuosdw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-006","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F78.pdf","jana-goyal-2018-network",[960,963],{"paper_id":953,"author_seq":247,"given_name":961,"surname":962,"affiliation":63,"orcid":63},"Abhik","Jana",{"paper_id":953,"author_seq":232,"given_name":964,"surname":965,"affiliation":63,"orcid":63},"Pawan","Goyal","Distinguishing lexical relations has been a long term pursuit in natural language processing (NLP) domain. Recently, in order to detect lexical relations like hypernymy, meronymy, co-hyponymy etc., distributional semantic models are being used extensively in some form or the other. Even though a lot of efforts have been made for detecting hypernymy relation, the problem of co-hyponymy detection has been rarely investigated. 
In this paper, we are proposing a novel supervised model where various network measures have been utilized to identify co-hyponymy relation with high accuracy performing better or at par with the state-of-the-art models.",{"paper_id":968,"title":969,"year":81,"month":855,"day":63,"doi":970,"resource_url":971,"first_page":63,"last_page":63,"pdf_url":972,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":973,"paper_type":860,"authors":974,"abstract":980},"lrec2018-main-007","Cross-Lingual Generation and Evaluation of a Wide-Coverage Lexical Semantic Resource","10.63317\u002F4z6kwyxy6i6c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-007","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F174.pdf","novak-novak-2018-cross",[975,978],{"paper_id":968,"author_seq":247,"given_name":976,"surname":977,"affiliation":63,"orcid":63},"Attila","Novák",{"paper_id":968,"author_seq":232,"given_name":979,"surname":977,"affiliation":63,"orcid":63},"Borbála","Neural word embedding models trained on sizable corpora have proved to be a very efficient means of representing meaning. However, the abstract vectors representing words and phrases in these models are not interpretable for humans by themselves. In this paper we present the Thing Recognizer, a method that assigns explicit symbolic semantic features from a finite list of terms to words present in an embedding model, making the model interpretable for humans and covering the semantic space by a controlled vocabulary of semantic features. 
We do this in a cross-lingual manner, applying semantic tags taken from lexical resources in one language (English) to the embedding space of another (Hungarian).",{"paper_id":982,"title":983,"year":81,"month":855,"day":63,"doi":984,"resource_url":985,"first_page":63,"last_page":63,"pdf_url":986,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":987,"paper_type":860,"authors":988,"abstract":1003},"lrec2018-main-008","Advances in Pre-Training Distributed Word Representations","10.63317\u002F4b3prw5a5tze","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-008","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F721.pdf","mikolov-etal-2018-advances",[989,992,995,998,1000],{"paper_id":982,"author_seq":247,"given_name":990,"surname":991,"affiliation":63,"orcid":63},"Tomas","Mikolov",{"paper_id":982,"author_seq":232,"given_name":993,"surname":994,"affiliation":63,"orcid":63},"Edouard","Grave",{"paper_id":982,"author_seq":218,"given_name":996,"surname":997,"affiliation":63,"orcid":63},"Piotr","Bojanowski",{"paper_id":982,"author_seq":203,"given_name":904,"surname":999,"affiliation":63,"orcid":63},"Puhrsch",{"paper_id":982,"author_seq":188,"given_name":1001,"surname":1002,"affiliation":63,"orcid":63},"Armand","Joulin","Many Natural Language Processing applications nowadays rely on pre-trained word representations estimated from large text corpora such as news collections, Wikipedia and Web Crawl. In this paper, we show how to train high-quality word vector representations by using a combination of known tricks that are however rarely used together. 
The main result of our work is the new set of publicly available pre-trained models that outperform the current state of the art by a large margin on a number of tasks.",{"paper_id":1005,"title":1006,"year":81,"month":855,"day":63,"doi":1007,"resource_url":1008,"first_page":63,"last_page":63,"pdf_url":1009,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1010,"paper_type":860,"authors":1011,"abstract":1024},"lrec2018-main-009","Integrating Generative Lexicon Event Structures into VerbNet","10.63317\u002F2vxw58tvfbm4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-009","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1056.pdf","brown-etal-2018-integrating",[1012,1015,1018,1021],{"paper_id":1005,"author_seq":247,"given_name":1013,"surname":1014,"affiliation":63,"orcid":63},"Susan Windisch","Brown",{"paper_id":1005,"author_seq":232,"given_name":1016,"surname":1017,"affiliation":63,"orcid":63},"James","Pustejovsky",{"paper_id":1005,"author_seq":218,"given_name":1019,"surname":1020,"affiliation":63,"orcid":63},"Annie","Zaenen",{"paper_id":1005,"author_seq":203,"given_name":1022,"surname":1023,"affiliation":63,"orcid":63},"Martha","Palmer","This paper focuses on specific changes to the semantic representations associated with classes of verbs in the English lexical resource VerbNet. The new form has been restricted to first-order representations to simplify use by and integration with planners. More significantly, the modifications incorporate the Generative Lexicon's event structure, with temporal ordering of subevents associated with explicit predications over the verb's arguments. 
These changes allow for greater flexibility in representing complex events, for a more consistent treatment of the oppositions inherent in change-of-state classes, and for a more nuanced portrayal of the Agent's role.",{"paper_id":1026,"title":1027,"year":81,"month":855,"day":63,"doi":1028,"resource_url":1029,"first_page":63,"last_page":63,"pdf_url":1030,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1031,"paper_type":860,"authors":1032,"abstract":1039},"lrec2018-main-010","FontLex: A Typographical Lexicon based on Affective Associations","10.63317\u002F59okd4eav957","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-010","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1059.pdf","kulahcioglu-de-melo-2018-fontlex",[1033,1036],{"paper_id":1026,"author_seq":247,"given_name":1034,"surname":1035,"affiliation":63,"orcid":63},"Tugba","Kulahcioglu",{"paper_id":1026,"author_seq":232,"given_name":1037,"surname":1038,"affiliation":63,"orcid":63},"Gerard","de Melo","The task of selecting suitable fonts for a given text is non-trivial, as tens of thousands of fonts are available, and the choice of font has been shown to affect the perception of the text as well as of the author or of the brand being advertized. Aiming to support the development of font recommendation tools, we create a typographical lexicon providing associations between words and fonts. We achieve this by means of affective evocations, making use of font--emotion and word--emotion relationships. For this purpose, we first determine font vectors for a set of ten emotion attributes, based on word similarities and antonymy information. We evaluate these associations through a user study via Mechanical Turk, which, for eight of the ten emotions, shows a strong user preference towards the fonts that are found to be congruent by our predicted data. 
Subsequently, this data is used to calculate font vectors for specific words, by relying on the emotion associations of a given word. This leads to a set of font associations for 6.4K words. We again evaluate the resulting dataset using Mechanical Turk, on 25 randomly sampled words. For the majority of these words, the responses indicate that fonts with strong associations are preferred, and for all except 2 words, fonts with weak associations are dispreferred. Finally, we further extend the dataset using synonyms of font attributes and emotion names. The resulting FontLex resource provides mappings between 6.7K words and 200 fonts.",{"paper_id":1041,"title":1042,"year":81,"month":855,"day":63,"doi":1043,"resource_url":1044,"first_page":63,"last_page":63,"pdf_url":1045,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1046,"paper_type":860,"authors":1047,"abstract":1060},"lrec2018-main-011","Multi-layer Annotation of the Rigveda","10.63317\u002F3djvpxc2ghs3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-011","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F274.pdf","hellwig-etal-2018-multi",[1048,1051,1054,1057],{"paper_id":1041,"author_seq":247,"given_name":1049,"surname":1050,"affiliation":63,"orcid":63},"Oliver","Hellwig",{"paper_id":1041,"author_seq":232,"given_name":1052,"surname":1053,"affiliation":63,"orcid":63},"Heinrich","Hettrich",{"paper_id":1041,"author_seq":218,"given_name":1055,"surname":1056,"affiliation":63,"orcid":63},"Ashutosh","Modi",{"paper_id":1041,"author_seq":203,"given_name":1058,"surname":1059,"affiliation":63,"orcid":63},"Manfred","Pinkal","The paper introduces a multi-level annotation of the Rigveda, a fundamental Sanskrit text composed in the 2. millenium BCE that is important for South-Asian and Indo-European linguistics, as well as Cultural Studies. 
We describe the individual annotation levels, including phonetics, morphology, lexicon, and syntax, and show how these different levels of annotation are merged to create a novel annotated corpus of Vedic Sanskrit. Vedic Sanskrit is a complex, but computationally under-resourced language. Therefore, creating this resource required considerable domain adaptation of existing computational tools, which is discussed in this paper. Because parts of the annotations are selective, we propose a bi-directional LSTM based sequential model to supplement missing verb-argument links.",{"paper_id":1062,"title":1063,"year":81,"month":855,"day":63,"doi":1064,"resource_url":1065,"first_page":63,"last_page":63,"pdf_url":1066,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1067,"paper_type":860,"authors":1068,"abstract":1090},"lrec2018-main-012","The Natural Stories Corpus","10.63317\u002F2nyub6u36pvm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-012","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F337.pdf","futrell-etal-2018-natural",[1069,1072,1075,1078,1081,1084,1087],{"paper_id":1062,"author_seq":247,"given_name":1070,"surname":1071,"affiliation":63,"orcid":63},"Richard","Futrell",{"paper_id":1062,"author_seq":232,"given_name":1073,"surname":1074,"affiliation":63,"orcid":63},"Edward","Gibson",{"paper_id":1062,"author_seq":218,"given_name":1076,"surname":1077,"affiliation":63,"orcid":63},"Harry J.","Tily",{"paper_id":1062,"author_seq":203,"given_name":1079,"surname":1080,"affiliation":63,"orcid":63},"Idan","Blank",{"paper_id":1062,"author_seq":188,"given_name":1082,"surname":1083,"affiliation":63,"orcid":63},"Anastasia","Vishnevetsky",{"paper_id":1062,"author_seq":172,"given_name":1085,"surname":1086,"affiliation":63,"orcid":63},"Steven","Piantadosi",{"paper_id":1062,"author_seq":155,"given_name":1088,"surname":1089,"affiliation":63,"orcid":63},"Evelina","Fedorenko","It is now a common practice to compare 
models of human language processing by comparing how well they predict behavioral and neural measures of processing difficulty, such as reading times, on corpora of rich naturalistic linguistic materials. However, many of these corpora, which are based on naturally-occurring text, do not contain many of the low-frequency syntactic constructions that are often required to distinguish between processing theories.  Here we describe a new corpus consisting of English texts edited to contain many low-frequency syntactic constructions while still sounding fluent to native speakers. The corpus is annotated with hand-corrected Penn Treebank-style parse trees and includes self-paced reading time  data and aligned audio recordings. Here we give an overview of the content of the corpus and release the data.",{"paper_id":1092,"title":1093,"year":81,"month":855,"day":63,"doi":1094,"resource_url":1095,"first_page":63,"last_page":63,"pdf_url":1096,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1097,"paper_type":860,"authors":1098,"abstract":1111},"lrec2018-main-013","Semi-automatic Korean FrameNet Annotation over KAIST Treebank","10.63317\u002F474rdab3jnfk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-013","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F356.pdf","hahm-etal-2018-semi",[1099,1102,1105,1108],{"paper_id":1092,"author_seq":247,"given_name":1100,"surname":1101,"affiliation":63,"orcid":63},"Younggyun","Hahm",{"paper_id":1092,"author_seq":232,"given_name":1103,"surname":1104,"affiliation":63,"orcid":63},"Jiseong","Kim",{"paper_id":1092,"author_seq":218,"given_name":1106,"surname":1107,"affiliation":63,"orcid":63},"Sunggoo","Kwon",{"paper_id":1092,"author_seq":203,"given_name":1109,"surname":1110,"affiliation":63,"orcid":63},"Key-Sun","Choi","This paper describes a project for constructing FrameNet annotations in Korean over the KAIST treebank corpus to scale up the Korean FrameNet resource. 
Annotating FrameNet over raw sentences is an expensive and complex task, because of which we have designed this project using a semi-automatic annotation approach. This paper describes the approach and its expected results. As a first step, we built a lexical database of the Korean FrameNet, and used it to learn the model for automatic annotation. Its current scope, status, and limitations are discussed in this paper.",{"paper_id":1113,"title":1114,"year":81,"month":855,"day":63,"doi":1115,"resource_url":1116,"first_page":63,"last_page":63,"pdf_url":1117,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1118,"paper_type":860,"authors":1119,"abstract":1138},"lrec2018-main-014","Handling Normalization Issues for Part-of-Speech Tagging of Online Conversational Text","10.63317\u002F2sgmsme8ccj4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-014","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F357.pdf","damnati-etal-2018-handling",[1120,1123,1126,1129,1132,1135],{"paper_id":1113,"author_seq":247,"given_name":1121,"surname":1122,"affiliation":63,"orcid":63},"Géraldine","Damnati",{"paper_id":1113,"author_seq":232,"given_name":1124,"surname":1125,"affiliation":63,"orcid":63},"Jeremy","Auguste",{"paper_id":1113,"author_seq":218,"given_name":1127,"surname":1128,"affiliation":63,"orcid":63},"Alexis","Nasr",{"paper_id":1113,"author_seq":203,"given_name":1130,"surname":1131,"affiliation":63,"orcid":63},"Delphine","Charlet",{"paper_id":1113,"author_seq":188,"given_name":1133,"surname":1134,"affiliation":63,"orcid":63},"Johannes","Heinecke",{"paper_id":1113,"author_seq":172,"given_name":1136,"surname":1137,"affiliation":63,"orcid":63},"Frédéric","Béchet","For the purpose of POS tagging noisy user-generated text, should normalization be handled as a preliminary task or is it possible to handle misspelled words directly in the POS tagging model? 
We propose in this paper a combined approach where some errors are normalized before tagging, while a Gated Recurrent Unit deep neural network based tagger handles the remaining errors. Word embeddings are trained on a large corpus in order to address both normalization and POS tagging.  Experiments are run on Contact Center chat conversations, a particular type of formal Computer Mediated Communication data.",{"paper_id":1140,"title":1141,"year":81,"month":855,"day":63,"doi":1142,"resource_url":1143,"first_page":63,"last_page":63,"pdf_url":1144,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1145,"paper_type":860,"authors":1146,"abstract":1174},"lrec2018-main-015","Multi-Dialect Arabic POS Tagging: A CRF Approach","10.63317\u002F2ctharew6nhx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-015","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F562.pdf","darwish-etal-2018-multi",[1147,1150,1153,1156,1159,1162,1165,1168,1171],{"paper_id":1140,"author_seq":247,"given_name":1148,"surname":1149,"affiliation":63,"orcid":63},"Kareem","Darwish",{"paper_id":1140,"author_seq":232,"given_name":1151,"surname":1152,"affiliation":63,"orcid":63},"Hamdy","Mubarak",{"paper_id":1140,"author_seq":218,"given_name":1154,"surname":1155,"affiliation":63,"orcid":63},"Ahmed","Abdelali",{"paper_id":1140,"author_seq":203,"given_name":1157,"surname":1158,"affiliation":63,"orcid":63},"Mohamed","Eldesouki",{"paper_id":1140,"author_seq":188,"given_name":1160,"surname":1161,"affiliation":63,"orcid":63},"Younes","Samih",{"paper_id":1140,"author_seq":172,"given_name":1163,"surname":1164,"affiliation":63,"orcid":63},"Randah","Alharbi",{"paper_id":1140,"author_seq":155,"given_name":1166,"surname":1167,"affiliation":63,"orcid":63},"Mohammed","Attia",{"paper_id":1140,"author_seq":138,"given_name":1169,"surname":1170,"affiliation":63,"orcid":63},"Walid","Magdy",{"paper_id":1140,"author_seq":121,"given_name":1172,"surname":1173,"affiliatio
n":63,"orcid":63},"Laura","Kallmeyer","This paper introduces a new dataset of POS-tagged Arabic tweets in four major dialects along with tagging guidelines. The data, which we are releasing publicly, includes tweets in Egyptian, Levantine, Gulf, and Maghrebi, with 350 tweets for each dialect with appropriate train\u002Ftest\u002Fdevelopment splits for 5-fold cross validation. We use a Conditional Random Fields (CRF) sequence labeler to train POS taggers for each dialect and examine the effect of cross and joint dialect training, and give benchmark results for the datasets. Using clitic n-grams, clitic metatypes, and stem templates as features, we were able to train a joint model that can correctly tag four different dialects with an average accuracy of 89.3%.",{"paper_id":1176,"title":1177,"year":81,"month":855,"day":63,"doi":1178,"resource_url":1179,"first_page":63,"last_page":63,"pdf_url":1180,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1181,"paper_type":860,"authors":1182,"abstract":1189},"lrec2018-main-016","A Corpus for Modeling Word Importance in Spoken Dialogue Transcripts","10.63317\u002F28r4h7tnjefk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-016","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F187.pdf","kafle-huenerfauth-2018-corpus",[1183,1186],{"paper_id":1176,"author_seq":247,"given_name":1184,"surname":1185,"affiliation":63,"orcid":63},"Sushant","Kafle",{"paper_id":1176,"author_seq":232,"given_name":1187,"surname":1188,"affiliation":63,"orcid":63},"Matt","Huenerfauth","Motivated by a project to create a system for people who are deaf or hard-of-hearing that would use automatic speech recognition (ASR) to produce real-time text captions of spoken English during in-person meetings with hearing individuals, we have augmented a transcript of the Switchboard conversational dialogue corpus with an overlay of word-importance annotations, with a numeric score for each word, to 
indicate its importance to the meaning of each dialogue turn. Further, we demonstrate the utility of this corpus by training an automatic word importance labeling model; our best performing model has an F-score of 0.60 in an ordinal 6-class word-importance classification task with an agreement (concordance correlation coefficient) of 0.839 with the human annotators (agreement score between annotators is 0.89). Finally, we discuss our intended future applications of this resource, particularly for the task of evaluating ASR performance, i.e. creating metrics that predict ASR-output caption text usability for DHH users better than Word Error Rate (WER).",{"paper_id":1191,"title":1192,"year":81,"month":855,"day":63,"doi":1193,"resource_url":1194,"first_page":63,"last_page":63,"pdf_url":1195,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1196,"paper_type":860,"authors":1197,"abstract":1234},"lrec2018-main-017","Dialogue Structure Annotation for Multi-Floor Interaction","10.63317\u002F3wxbwgsinirw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-017","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F672.pdf","traum-etal-2018-dialogue",[1198,1201,1204,1207,1210,1213,1216,1219,1222,1225,1228,1231],{"paper_id":1191,"author_seq":247,"given_name":1199,"surname":1200,"affiliation":63,"orcid":63},"David","Traum",{"paper_id":1191,"author_seq":232,"given_name":1202,"surname":1203,"affiliation":63,"orcid":63},"Cassidy","Henry",{"paper_id":1191,"author_seq":218,"given_name":1205,"surname":1206,"affiliation":63,"orcid":63},"Stephanie","Lukin",{"paper_id":1191,"author_seq":203,"given_name":1208,"surname":1209,"affiliation":63,"orcid":63},"Ron","Artstein",{"paper_id":1191,"author_seq":188,"given_name":1211,"surname":1212,"affiliation":63,"orcid":63},"Felix","Gervits",{"paper_id":1191,"author_seq":172,"given_name":1214,"surname":1215,"affiliation":63,"orcid":63},"Kimberly","Pollard",{"paper_id":1191,"author_seq":155,"giv
en_name":1217,"surname":1218,"affiliation":63,"orcid":63},"Claire","Bonial",{"paper_id":1191,"author_seq":138,"given_name":1220,"surname":1221,"affiliation":63,"orcid":63},"Su","Lei",{"paper_id":1191,"author_seq":121,"given_name":1223,"surname":1224,"affiliation":63,"orcid":63},"Clare","Voss",{"paper_id":1191,"author_seq":104,"given_name":1226,"surname":1227,"affiliation":63,"orcid":63},"Matthew","Marge",{"paper_id":1191,"author_seq":87,"given_name":1229,"surname":1230,"affiliation":63,"orcid":63},"Cory","Hayes",{"paper_id":1191,"author_seq":73,"given_name":1232,"surname":1233,"affiliation":63,"orcid":63},"Susan","Hill","We present an annotation scheme for meso-level dialogue structure, specifically designed for multi-floor dialogue. The scheme includes a transaction unit that clusters utterances from multiple participants and floors into units according to realization of an initiator’s intent, and relations between individual utterances within the unit. We apply this scheme to annotate a corpus of multi-floor human-robot interaction dialogues. We examine the patterns of structure observed in these dialogues and present inter-annotator statistics and relative frequencies of types of relations and transaction units. 
Finally, some example applications of these annotations are introduced.",{"paper_id":1236,"title":1237,"year":81,"month":855,"day":63,"doi":1238,"resource_url":1239,"first_page":63,"last_page":63,"pdf_url":1240,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1241,"paper_type":860,"authors":1242,"abstract":1253},"lrec2018-main-018","Effects of Gender Stereotypes on Trust and Likability in Spoken Human-Robot Interaction","10.63317\u002F3eafaj49u3f7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-018","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F824.pdf","kraus-etal-2018-effects",[1243,1246,1247,1250],{"paper_id":1236,"author_seq":247,"given_name":1244,"surname":1245,"affiliation":63,"orcid":63},"Matthias","Kraus",{"paper_id":1236,"author_seq":232,"given_name":1133,"surname":1245,"affiliation":63,"orcid":63},{"paper_id":1236,"author_seq":218,"given_name":1248,"surname":1249,"affiliation":63,"orcid":63},"Martin","Baumann",{"paper_id":1236,"author_seq":203,"given_name":1251,"surname":1252,"affiliation":63,"orcid":63},"Wolfgang","Minker","As robots enter more and more areas of everyday life, it becomes necessary for them to interact in an understandable and trustworthy way. In many regards this requires a human-like interaction pattern. This research investigates the influence of gender stereotypes on trust and likability of humanoid robots. In this endeavor, explicit (name and voice) and implicit gender (personality) of robots have been manipulated along with the stereotypicality of a task. 40 participants interacted with a NAO robot to gain feedback on a task they were working on and rated the perception of the robot cooperation partner. While no gender stereotypes were found for the explicit gender, implicit gender showed a strong effect on trust and likability in the stereotypical male task. 
Participants trusted the male robot more and rated it as more reliable and competent than the female personality robot, while the female robot was perceived as more likable. These findings indicate that for gender stereotypes in robot interaction a differentiation between explicit and implicit stereotypical features have to be drawn and that the task context needs consideration. Future research may look into situational variables that drive stereotypification in human-robot interaction.",{"paper_id":1255,"title":1256,"year":81,"month":855,"day":63,"doi":1257,"resource_url":1258,"first_page":63,"last_page":63,"pdf_url":1259,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1260,"paper_type":860,"authors":1261,"abstract":1286},"lrec2018-main-019","A Multimodal Corpus for Mutual Gaze and Joint Attention in Multiparty Situated Interaction","10.63317\u002F4ewj3auq84wm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-019","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F987.pdf","kontogiorgos-etal-2018-multimodal",[1262,1265,1268,1271,1274,1277,1280,1283],{"paper_id":1255,"author_seq":247,"given_name":1263,"surname":1264,"affiliation":63,"orcid":63},"Dimosthenis","Kontogiorgos",{"paper_id":1255,"author_seq":232,"given_name":1266,"surname":1267,"affiliation":63,"orcid":63},"Vanya","Avramova",{"paper_id":1255,"author_seq":218,"given_name":1269,"surname":1270,"affiliation":63,"orcid":63},"Simon","Alexanderson",{"paper_id":1255,"author_seq":203,"given_name":1272,"surname":1273,"affiliation":63,"orcid":63},"Patrik","Jonell",{"paper_id":1255,"author_seq":188,"given_name":1275,"surname":1276,"affiliation":63,"orcid":63},"Catharine","Oertel",{"paper_id":1255,"author_seq":172,"given_name":1278,"surname":1279,"affiliation":63,"orcid":63},"Jonas","Beskow",{"paper_id":1255,"author_seq":155,"given_name":1281,"surname":1282,"affiliation":63,"orcid":63},"Gabriel","Skantze",{"paper_id":1255,"author_seq":138,"given_name":128
4,"surname":1285,"affiliation":63,"orcid":63},"Joakim","Gustafson","In this paper we present a corpus of multiparty situated interaction where participants collaborated on moving virtual objects on a large touch screen. A moderator facilitated the discussion and directed the interaction. The corpus contains recordings of a variety of multimodal data, in that we captured speech, eye gaze and gesture data using a multisensory setup (wearable eye trackers, motion capture and audio\u002Fvideo). Furthermore, in the description of the multimodal corpus, we investigate four different types of social gaze: referential gaze, joint attention, mutual gaze and gaze aversion by both perspectives of a speaker and a listener. We annotated the groups’ object references during object manipulation tasks and analysed the group’s proportional referential eye-gaze with regards to the referent object. When investigating the distributions of gaze during and before referring expressions we could corroborate the differences in time between speakers’ and listeners’ eye gaze found in earlier studies. 
This corpus is of particular interest to researchers who are interested in social eye-gaze patterns in turn-taking and referring language in situated multi-party interaction.",{"paper_id":1288,"title":1289,"year":81,"month":855,"day":63,"doi":1290,"resource_url":1291,"first_page":63,"last_page":63,"pdf_url":1292,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1293,"paper_type":860,"authors":1294,"abstract":1304},"lrec2018-main-020","Improving Dialogue Act Classification for Spontaneous Arabic Speech and Instant Messages at Utterance Level","10.63317\u002F4jiicsxrpcss","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-020","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1099.pdf","elmadany-etal-2018-improving",[1295,1298,1301],{"paper_id":1288,"author_seq":247,"given_name":1296,"surname":1297,"affiliation":63,"orcid":63},"AbdelRahim","Elmadany",{"paper_id":1288,"author_seq":232,"given_name":1299,"surname":1300,"affiliation":63,"orcid":63},"Sherif","Abdou",{"paper_id":1288,"author_seq":218,"given_name":1302,"surname":1303,"affiliation":63,"orcid":63},"Mervat","Gheith","The ability to model and automatically detect dialogue act is an important step toward understanding spontaneous speech and Instant Messages. However, it has been difficult to infer a dialogue act from a surface utterance because it highly depends on the context of the utterance and speaker linguistic knowledge; especially in Arabic dialects.  This paper proposes a statistical dialogue analysis model to recognize utterance’s dialogue acts using a multi-classes hierarchical structure. The model can automatically acquire probabilistic discourse knowledge from a dialogue corpus were collected and annotated manually from multi-genre Egyptian call-centers. Extensive experiments were conducted using Support Vector Machines classifier to evaluate the system performance. 
The results attained in the term of average F-measure scores of 0.912; showed that the proposed approach has moderately improved F-measure by approximately 20%.",{"paper_id":1306,"title":1307,"year":81,"month":855,"day":63,"doi":1308,"resource_url":1309,"first_page":63,"last_page":63,"pdf_url":1310,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1311,"paper_type":860,"authors":1312,"abstract":1322},"lrec2018-main-021","Data Management Plan (DMP) for Language Data under the New General Data Protection Regulation (GDPR)","10.63317\u002F2r9x5xjqoimt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-021","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F313.pdf","kamocki-etal-2018-data",[1313,1316,1319],{"paper_id":1306,"author_seq":247,"given_name":1314,"surname":1315,"affiliation":63,"orcid":63},"Pawel","Kamocki",{"paper_id":1306,"author_seq":232,"given_name":1317,"surname":1318,"affiliation":63,"orcid":63},"Valérie","Mapelli",{"paper_id":1306,"author_seq":218,"given_name":1320,"surname":1321,"affiliation":63,"orcid":63},"Khalid","Choukri","The EU’s General Data Protection Regulation (GDPR) of 27 April 2016 will apply from 25 May 2018. It will reinforce certain principles related to the processing of personal data, which will also affect many projects in the field of Natural Language Processing. Perhaps most importantly, the GDPR will introduce the principle of accountability, according to which the data processor shall be able to demonstrate compliance with the new rules, and that he applies ‘privacy by design and by default’. In our opinion, a well-drafted Data Management Plan (DMP) is of key importance for GDPR compliance; indeed, the trend towards the adoption of a DMP, particularly in EU-funded research projects, has been more vivid since 2017, after the extension of the Horizon 2020 Open Data Pilot. 
Since 2015, ELRA also proposes its own template for the Data Management Plan, which is being updated to take the new law into account. In this paper, we present the new legal framework introduced by the GDPR and propose how the new rules can be integrated in the DMP in order to increase transparency of processing, facilitate demonstration of GDPR compliance and spread good practices within the community.",{"paper_id":1324,"title":1325,"year":81,"month":855,"day":63,"doi":1326,"resource_url":1327,"first_page":63,"last_page":63,"pdf_url":1328,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1329,"paper_type":860,"authors":1330,"abstract":1334},"lrec2018-main-022","We Are Depleting Our Research Subject as We Are Investigating It: In Language Technology, more Replication and Diversity Are Needed","10.63317\u002F4hkgqvka3hi7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-022","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F397.pdf","branco-2018-depleting",[1331],{"paper_id":1324,"author_seq":247,"given_name":1332,"surname":1333,"affiliation":63,"orcid":63},"António","Branco","In this paper, we present an analysis indicating that, in language technology, as we are investigating natural language we are contributing to deplete it in the sense that we are contributing to reduce the diversity of languages. 
To address this circumstance, we propose that more replication and reproduction and more language diversity need to be taken into account in our research activities.",{"paper_id":1336,"title":1337,"year":81,"month":855,"day":63,"doi":1338,"resource_url":1339,"first_page":63,"last_page":63,"pdf_url":1340,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1341,"paper_type":860,"authors":1342,"abstract":1349},"lrec2018-main-023","Lessons Learned: On the Challenges of Migrating a Research Data Repository from a Research Institution to a University Library.","10.63317\u002F36p6jeyukwkd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-023","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F504.pdf","trippel-zinn-2018-lessons",[1343,1346],{"paper_id":1336,"author_seq":247,"given_name":1344,"surname":1345,"affiliation":63,"orcid":63},"Thorsten","Trippel",{"paper_id":1336,"author_seq":232,"given_name":1347,"surname":1348,"affiliation":63,"orcid":63},"Claus","Zinn","The transfer of research data management from one institution to another infrastructural partner is all but trivial, but can be required, for instance, when an institution faces reorganisation or closure. In a case study, we describe the migration of all research data, identify the challenges we encountered, and discuss how we addressed them. It shows that the moving of research data management to another institution is a feasible, but potentially costly enterprise. 
Being able to demonstrate the feasibility of research data migration supports the stance of data archives that users can expect high levels of trust and reliability when it comes to data safety and sustainability.",{"paper_id":1351,"title":1352,"year":81,"month":855,"day":63,"doi":1353,"resource_url":1354,"first_page":63,"last_page":63,"pdf_url":1355,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1356,"paper_type":860,"authors":1357,"abstract":1372},"lrec2018-main-024","Introducing NIEUW: Novel Incentives and Workflows for Eliciting Linguistic Data","10.63317\u002F3fnj8rqwe4ne","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-024","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F894.pdf","cieri-etal-2018-introducing",[1358,1361,1363,1366,1369],{"paper_id":1351,"author_seq":247,"given_name":1359,"surname":1360,"affiliation":63,"orcid":63},"Christopher","Cieri",{"paper_id":1351,"author_seq":232,"given_name":1016,"surname":1362,"affiliation":63,"orcid":63},"Fiumara",{"paper_id":1351,"author_seq":218,"given_name":1364,"surname":1365,"affiliation":63,"orcid":63},"Mark","Liberman",{"paper_id":1351,"author_seq":203,"given_name":1367,"surname":1368,"affiliation":63,"orcid":63},"Chris","Callison-Burch",{"paper_id":1351,"author_seq":188,"given_name":1370,"surname":1371,"affiliation":63,"orcid":63},"Jonathan","Wright","This paper introduces the NIEUW (Novel Incentives and Workflows) project funded by the United States National Science Foundation and part of the Linguistic Data Consortium’s strategy to provide order of magnitude improvement in the scale, cost, variety, linguistic diversity and quality of Language Resources available for education, research and technology development. 
Notwithstanding decades of effort and progress in collecting and distributing Language Resources, it remains the case that demand still far exceeds supply for all of the approximately 7000 languages in the world, even the most well documented languages with global economic and political influence. The absence of Language Resources, regardless of the language, stifles teaching and technology building, inhibiting the creation of language enabled applications and, as a result, commerce and communication. Project oriented approaches which focus intensive funding and effort on problems of limited scope over short durations can only address part of the problem. The HLT community instead requires approaches that do not rely upon highly constrained resources such as project funding and can be sustained across many languages and many years. In this paper, we describe a new initiative to harness the power of alternative incentives to elicit linguistic data and annotation. We also describe changes to the workflows necessary to collect data from workforces attracted by these incentives.",{"paper_id":1374,"title":1375,"year":81,"month":855,"day":63,"doi":1376,"resource_url":1377,"first_page":63,"last_page":63,"pdf_url":1378,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1379,"paper_type":860,"authors":1380,"abstract":1411},"lrec2018-main-025","Three Dimensions of Reproducibility in Natural Language Processing","10.63317\u002F58wtisabky5x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-025","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F952.pdf","cohen-etal-2018-three",[1381,1384,1387,1390,1393,1396,1399,1402,1405,1408],{"paper_id":1374,"author_seq":247,"given_name":1382,"surname":1383,"affiliation":63,"orcid":63},"K. 
Bretonnel","Cohen",{"paper_id":1374,"author_seq":232,"given_name":1385,"surname":1386,"affiliation":63,"orcid":63},"Jingbo","Xia",{"paper_id":1374,"author_seq":218,"given_name":1388,"surname":1389,"affiliation":63,"orcid":63},"Pierre","Zweigenbaum",{"paper_id":1374,"author_seq":203,"given_name":1391,"surname":1392,"affiliation":63,"orcid":63},"Tiffany","Callahan",{"paper_id":1374,"author_seq":188,"given_name":1394,"surname":1395,"affiliation":63,"orcid":63},"Orin","Hargraves",{"paper_id":1374,"author_seq":172,"given_name":1397,"surname":1398,"affiliation":63,"orcid":63},"Foster","Goss",{"paper_id":1374,"author_seq":155,"given_name":1400,"surname":1401,"affiliation":63,"orcid":63},"Nancy","Ide",{"paper_id":1374,"author_seq":138,"given_name":1403,"surname":1404,"affiliation":63,"orcid":63},"Aurélie","Névéol",{"paper_id":1374,"author_seq":121,"given_name":1406,"surname":1407,"affiliation":63,"orcid":63},"Cyril","Grouin",{"paper_id":1374,"author_seq":104,"given_name":1409,"surname":1410,"affiliation":63,"orcid":63},"Lawrence E.","Hunter","Despite considerable recent attention to problems with reproducibility of scientific research, there is a striking lack of agreement about the definition of the term. That is a problem, because the lack of a consensus definition makes it difficult to compare studies of reproducibility, and thus to have even a broad overview of the state of the issue in natural language processing. This paper proposes an ontology of reproducibility in that field. Its goal is to enhance both future research and communication about the topic, and retrospective meta-analyses. We show that three dimensions of reproducibility, corresponding to three kinds of claims in natural language processing papers, can account for a variety of types of research reports. These dimensions are reproducibility of a conclusion, of a finding, and of a value. 
Three biomedical natural language processing papers by the authors of this paper are analyzed with respect to these dimensions.",{"paper_id":1413,"title":1414,"year":81,"month":855,"day":63,"doi":1415,"resource_url":1416,"first_page":63,"last_page":63,"pdf_url":1417,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1418,"paper_type":860,"authors":1419,"abstract":1426},"lrec2018-main-026","Content-Based Conflict of Interest Detection on Wikipedia","10.63317\u002F2s27hupiyzfx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-026","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F256.pdf","orizu-he-2018-content",[1420,1423],{"paper_id":1413,"author_seq":247,"given_name":1421,"surname":1422,"affiliation":63,"orcid":63},"Udochukwu","Orizu",{"paper_id":1413,"author_seq":232,"given_name":1424,"surname":1425,"affiliation":63,"orcid":63},"Yulan","He","Wikipedia is one of the most visited websites in the world. On Wikipedia, Conflict-of-Interest (CoI) editing happens when an editor uses Wikipedia to advance their interests or relationships. This includes paid editing done by organisations for public relations purposes, etc. CoI detection is highly subjective and though closely related to vandalism and bias detection, it is a more difficult problem. In this paper, we frame CoI detection as a binary classification problem and explore various features which can be used to train supervised classifiers for CoI detection on Wikipedia articles. Our experimental results show that the best F-measure achieved is 0.67 by training SVM from a combination of features including stylometric, bias and emotion features. As we are not certain that our non-CoI set does not contain any CoI articles, we have also explored the use of one-class classification for CoI detection. The results show that using stylometric features outperforms other types of features or a combination of them and gives an F-measure of 0.63. 
Also, while binary classifiers give higher recall values (0.81~0.94), one-class classifier attains higher precision values (0.69~0.74).",{"paper_id":1428,"title":1429,"year":81,"month":855,"day":63,"doi":1430,"resource_url":1431,"first_page":63,"last_page":63,"pdf_url":1432,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1433,"paper_type":860,"authors":1434,"abstract":1438},"lrec2018-main-027","Word Affect Intensities","10.63317\u002F58igjzkfg9k5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-027","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F329.pdf","mohammad-2018-word",[1435],{"paper_id":1428,"author_seq":247,"given_name":1436,"surname":1437,"affiliation":63,"orcid":63},"Saif","Mohammad","Words often convey affect---emotions, feelings, and attitudes. Further, different words can convey affect to various degrees (intensities). However, existing manually created lexicons for basic emotions (such as anger and fear)  indicate only coarse categories of affect association (for example, associated with anger or not associated with anger). Automatic lexicons of affect provide fine degrees of association, but they tend not to be accurate as human-created lexicons. Here, for the first time, we present a manually created affect intensity lexicon with real-valued scores of intensity for four basic emotions: anger, fear, joy, and sadness.  (We will subsequently add entries for more emotions such as disgust, anticipation, trust, and surprise.) We refer to this dataset as the {\\it NRC Affect Intensity Lexicon}, or {\\it AIL} for short. AIL has entries for close to 6,000 English words.  We used a technique called best--worst scaling (BWS) to create the lexicon. BWS improves annotation consistency and obtains reliable fine-grained scores (split-half reliability $> 0.91$). 
We also compare the entries in AIL with the entries in the {\\it NRC VAD Lexicon}, which has valence, arousal, and dominance (VAD) scores for 20K English words. We find that anger, fear, and sadness words, on average, have very similar VAD scores. However, sadness words tend to have slightly lower dominance scores than fear and anger words.  The Affect Intensity Lexicon has applications in automatic emotion analysis in a number of domains such as commerce, education, intelligence, and public health. AIL is also useful in the building of natural language generation systems.",{"paper_id":1440,"title":1441,"year":81,"month":855,"day":63,"doi":1442,"resource_url":1443,"first_page":63,"last_page":63,"pdf_url":1444,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1445,"paper_type":860,"authors":1446,"abstract":1453},"lrec2018-main-028","Representation Mapping: A Novel Approach to Generate High-Quality Multi-Lingual Emotion Lexicons","10.63317\u002F22zt9dqqdya4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-028","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F402.pdf","buechel-hahn-2018-representation",[1447,1450],{"paper_id":1440,"author_seq":247,"given_name":1448,"surname":1449,"affiliation":63,"orcid":63},"Sven","Buechel",{"paper_id":1440,"author_seq":232,"given_name":1451,"surname":1452,"affiliation":63,"orcid":63},"Udo","Hahn","In the past years, sentiment analysis has increasingly shifted attention to representational frameworks more expressive than semantic polarity (being positive, negative or neutral). However, these richer formats (like Basic Emotions or Valence-Arousal-Dominance, and variants therefrom), rooted in psychological research, tend to proliferate the number of representation schemes for emotion encoding. Thus, a large amount of representationally incompatible emotion lexicons has been developed by various research groups adopting one or the other emotion representation format. 
As a consequence, the reusability of these resources decreases as does the comparability of systems using them. In this paper, we propose to solve this dilemma by methods and tools which map different representation formats onto each other for the sake of mutual compatibility and interoperability of language resources. We present the first large-scale investigation of such representation mappings for four typologically diverse languages and find evidence that our approach produces (near-)gold quality emotion lexicons, even in crosslingual settings. Finally, we use our models to create new lexicons for eight typologically diverse languages.",{"paper_id":1455,"title":1456,"year":81,"month":855,"day":63,"doi":1457,"resource_url":1458,"first_page":63,"last_page":63,"pdf_url":1459,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1460,"paper_type":860,"authors":1461,"abstract":1479},"lrec2018-main-029","Unfolding the External Behavior and Inner Affective State of Teammates through Ensemble Learning: Experimental Evidence from a Dyadic Team 
Corpus","10.63317\u002F2hp2kwjtyxoc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-029","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F838.pdf","vlachostergiou-etal-2018-unfolding",[1462,1465,1467,1470,1473,1476],{"paper_id":1455,"author_seq":247,"given_name":1463,"surname":1464,"affiliation":63,"orcid":63},"Aggeliki","Vlachostergiou",{"paper_id":1455,"author_seq":232,"given_name":1364,"surname":1466,"affiliation":63,"orcid":63},"Dennison",{"paper_id":1455,"author_seq":218,"given_name":1468,"surname":1469,"affiliation":63,"orcid":63},"Catherine","Neubauer",{"paper_id":1455,"author_seq":203,"given_name":1471,"surname":1472,"affiliation":63,"orcid":63},"Stefan","Scherer",{"paper_id":1455,"author_seq":188,"given_name":1474,"surname":1475,"affiliation":63,"orcid":63},"Peter","Khooshabeh",{"paper_id":1455,"author_seq":172,"given_name":1477,"surname":1478,"affiliation":63,"orcid":63},"Andre","Harrison","The current study was motivated to understand the relationship between the external behavior and inner affective state of two team members (``instructor\"-``defuser\") during a demanding operational task (i.e., bomb defusion). In this study we assessed team member's verbal responses (i.e., length of duration) in relation to their external as well as internal affective cues. External behavioral cues include defuser's verbal expressions while inner cues are based on physiological signals. More specifically, we differentiate between ``defusers'\" physiological patterns occurring after the ``instructor's\" turns according to whether they belong to a short or a long turn-taking response interval. Based on the assumption that longer turn-taking behaviors are likely to be caused by demanding cognitive task events and\u002For stressful interactions, we hypothesize that inner mechanisms produced in these intense affective activity intervals will be reflected on defuser's physiology. 
A dyadic team corpus was used to examine the association between the ``defusers\" physiological signals following the ``instructor's\" questions to predict whether they occurred in a short or long turn-taking period of time. The results suggest that an association does exist between turn taking and inner affective state. Additionally, it was our goal to further unpack this association by creating diverse ensembles. As such, we studied various base learners and different ensemble sizes to determine the best approach towards building a stable diverse ensemble that generalizes well on the external and inner cues of individuals.",{"paper_id":1481,"title":1482,"year":81,"month":855,"day":63,"doi":1483,"resource_url":1484,"first_page":63,"last_page":63,"pdf_url":1485,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1486,"paper_type":860,"authors":1487,"abstract":1492},"lrec2018-main-030","Understanding Emotions: A Dataset of Tweets to Study Interactions between Affect Categories","10.63317\u002F5or8sy3gqobf","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-030","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F957.pdf","mohammad-kiritchenko-2018-understanding",[1488,1489],{"paper_id":1481,"author_seq":247,"given_name":1436,"surname":1437,"affiliation":63,"orcid":63},{"paper_id":1481,"author_seq":232,"given_name":1490,"surname":1491,"affiliation":63,"orcid":63},"Svetlana","Kiritchenko","Human emotions are complex and nuanced.  Yet, an overwhelming majority of the work in automatically detecting emotions from text has focused only on classifying text into positive, negative, and neutral classes. Our goal is to create a single textual dataset that is annotated for many emotion dimensions (from both the basic emotion model and the VAD model). 
For each emotion dimension, we annotate tweets for not just coarse classes (such as anger or no anger) but also for fine-grained real-valued scores indicating the intensity of emotion.  We use Best-Worst Scaling to address the limitations of traditional rating scale methods such as inter- and intra-annotator inconsistency. We show that the fine-grained intensity scores thus obtained are reliable. The new dataset is useful for training and testing supervised machine learning algorithms for multi-label emotion classification, emotion intensity regression, detecting valence, detecting ordinal class of intensity of emotion (slightly sad, very angry, etc.), and detecting ordinal class of valence. The dataset also sheds light on crucial research questions such as:  which emotions often present together in tweets?; how do the intensities of the three negative emotions relate to each other?; and how do the intensities of the basic emotions relate to valence?",{"paper_id":1494,"title":1495,"year":81,"month":855,"day":63,"doi":1496,"resource_url":1497,"first_page":63,"last_page":63,"pdf_url":1498,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1499,"paper_type":860,"authors":1500,"abstract":1513},"lrec2018-main-031","When ACE met KBP: End-to-End Evaluation of Knowledge Base Population with Component-level 
Annotation","10.63317\u002F4hsaibscw5zv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-031","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F161.pdf","min-etal-2018-ace",[1501,1504,1507,1510],{"paper_id":1494,"author_seq":247,"given_name":1502,"surname":1503,"affiliation":63,"orcid":63},"Bonan","Min",{"paper_id":1494,"author_seq":232,"given_name":1505,"surname":1506,"affiliation":63,"orcid":63},"Marjorie","Freedman",{"paper_id":1494,"author_seq":218,"given_name":1508,"surname":1509,"affiliation":63,"orcid":63},"Roger","Bock",{"paper_id":1494,"author_seq":203,"given_name":1511,"surname":1512,"affiliation":63,"orcid":63},"Ralph","Weischedel","Building a Knowledge Base from text corpora is useful for many applications such as question answering and web search. Since 2012, the Cold Start Knowledge Base Population (KBP) evaluation at the Text Analysis Conference (TAC) has attracted many participants. Despite the popularity, the Cold Start KBP evaluation has several problems including but not limited to the following two: first, each year’s assessment dataset is a pooled set of query-answer pairs, primarily generated by participating systems. It is well known to participants that there is pooling bias: a system developed outside of the official evaluation period is not rewarded for finding novel answers, but rather is penalized for doing so. Second, the assessment dataset, constructed with lots of human effort, offers little help in training information extraction algorithms which are crucial ingredients for the end-to-end KBP task. To address these problems, we propose a new unbiased evaluation methodology that uses existing component-level annotation such as the Automatic Content Extraction (ACE) dataset, to evaluate Cold Start KBP. We also propose bootstrap resampling to provide statistical significance to the results reported. 
We will then present experimental results and analysis.",{"paper_id":1515,"title":1516,"year":81,"month":855,"day":63,"doi":1517,"resource_url":1518,"first_page":63,"last_page":63,"pdf_url":1519,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1520,"paper_type":860,"authors":1521,"abstract":1528},"lrec2018-main-032","Simple Large-scale Relation Extraction from Unstructured Text","10.63317\u002F44osfsqw99tu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-032","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F7.pdf","christodoulopoulos-mittal-2018-simple",[1522,1525],{"paper_id":1515,"author_seq":247,"given_name":1523,"surname":1524,"affiliation":63,"orcid":63},"Christos","Christodoulopoulos",{"paper_id":1515,"author_seq":232,"given_name":1526,"surname":1527,"affiliation":63,"orcid":63},"Arpit","Mittal","Knowledge-based question answering relies on the availability of facts, the majority of which cannot be found in structured sources (e.g. Wikipedia info-boxes, Wikidata). One of the major components of extracting facts from unstructured text is Relation Extraction (RE). In this paper we propose a novel method for creating distant (weak) supervision labels for training a large-scale RE system. We also provide new evidence about the effectiveness of neural network approaches by decoupling the model architecture from the feature design of a state-of-the-art neural network system. 
Surprisingly, a much simpler classifier trained on similar features performs on par with the highly complex neural network system (at 75x reduction to the training time), suggesting that the features are a bigger contributor to the final performance.",{"paper_id":1530,"title":1531,"year":81,"month":855,"day":63,"doi":1532,"resource_url":1533,"first_page":63,"last_page":63,"pdf_url":1534,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1535,"paper_type":860,"authors":1536,"abstract":1542},"lrec2018-main-033","Joint Learning of Sense and Word Embeddings","10.63317\u002F4dhzicjoev9u","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-033","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F393.pdf","alsuhaibani-bollegala-2018-joint",[1537,1539],{"paper_id":1530,"author_seq":247,"given_name":1166,"surname":1538,"affiliation":63,"orcid":63},"Alsuhaibani",{"paper_id":1530,"author_seq":232,"given_name":1540,"surname":1541,"affiliation":63,"orcid":63},"Danushka","Bollegala","Methods for learning lower-dimensional representations (embeddings) of words using unlabelled data have received a renewed interested due to their myriad success in various Natural Language Processing (NLP) tasks. However, despite their success, a common deficiency associated with most word embedding learning methods is that they learn a single representation for a word, ignoring the different senses of that word (polysemy). To address the polysemy problem, we propose a method that jointly learns sense-aware word embeddings using both unlabelled and sense-tagged text corpora. In particular, our proposed method can learn both word and sense embeddings by efficiently exploiting both types of resources. 
Our quantitative and qualitative experimental results using unlabelled text corpus with (a) manually annotated word senses, and (b) pseudo annotated senses demonstrate that the proposed method can correctly learn the multiple senses of an ambiguous word. Moreover, the word embeddings learnt by our proposed method outperform several previously proposed competitive word embedding learning methods on word similarity and short-text classification benchmark datasets.",{"paper_id":1544,"title":1545,"year":81,"month":855,"day":63,"doi":1546,"resource_url":1547,"first_page":63,"last_page":63,"pdf_url":1548,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1549,"paper_type":860,"authors":1550,"abstract":1556},"lrec2018-main-034","Comparing Pretrained Multilingual Word Embeddings on an Ontology Alignment Task","10.63317\u002F595h4gjdxwfr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-034","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F917.pdf","gromann-declerck-2018-comparing",[1551,1554],{"paper_id":1544,"author_seq":247,"given_name":1552,"surname":1553,"affiliation":63,"orcid":63},"Dagmar","Gromann",{"paper_id":1544,"author_seq":232,"given_name":881,"surname":1555,"affiliation":63,"orcid":63},"Declerck","Word embeddings capture a string's semantics and go beyond its surface form. In a multilingual environment, those embeddings need to be trained for each language, either separately or as a joint model. The more languages needed, the more computationally cost- and time-intensive the task of training. As an alternative, pretrained word embeddings can be utilized to compute semantic similarities of strings in different languages. This paper provides a comparison of three different multilingual pretrained word embedding repositories with a string-matching baseline and uses the task of ontology alignment as example scenario. 
A vast majority of ontology alignment methods rely on string similarity metrics, however, they frequently use string matching techniques that purely rely on syntactic aspects. Semantically oriented word embeddings have much to offer to ontology alignment algorithms, such as the simple Munkres algorithm utilized in this paper. The proposed approach produces a number of correct alignments on a non-standard data set based on embeddings from the three repositories, where FastText embeddings performed best on all four languages and clearly outperformed the string-matching baseline.",{"paper_id":1558,"title":1559,"year":81,"month":855,"day":63,"doi":1560,"resource_url":1561,"first_page":63,"last_page":63,"pdf_url":1562,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1563,"paper_type":860,"authors":1564,"abstract":1574},"lrec2018-main-035","A Large Resource of Patterns for Verbal Paraphrases","10.63317\u002F3zszfrq5vp3g","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-035","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1118.pdf","popescu-etal-2018-large",[1565,1568,1571],{"paper_id":1558,"author_seq":247,"given_name":1566,"surname":1567,"affiliation":63,"orcid":63},"Octavian","Popescu",{"paper_id":1558,"author_seq":232,"given_name":1569,"surname":1570,"affiliation":63,"orcid":63},"Ngoc Phuoc An","Vo",{"paper_id":1558,"author_seq":218,"given_name":1572,"surname":1573,"affiliation":63,"orcid":63},"Vadim","Sheinin","Paraphrases play an important role in natural language understanding, especially because there are fluent jumps between hidden paraphrases in a text. For example, even to get to the meaning of a simple dialog like I bought a computer. How much did the computer cost? involves quite a few steps. A computational system may actually have a huge problem in linking the two sentences as their connection is not overtly present in the text. 
However, it becomes easier if it has access to the following paraphrases: [HUMAN] buy [ARTIFACT] () [HUMAN] pay [PRICE] for [ARTIFACT] () [ARTIFACT] cost [HUMAN] [PRICE], and also to the information that I IsA [HUMAN] and computer IsA [ARTIFACT]. In this paper we introduce a resource of such paraphrases that was extracted by investigating large corpora in an unsupervised manner. The resource contains tens of thousands of such pairs and it is available for academic purposes.",{"paper_id":1576,"title":1577,"year":81,"month":855,"day":63,"doi":1578,"resource_url":1579,"first_page":63,"last_page":63,"pdf_url":1580,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1581,"paper_type":860,"authors":1582,"abstract":1592},"lrec2018-main-036","Building Parallel Monolingual Gan Chinese Dialects Corpus","10.63317\u002F3o83gfft44d7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-036","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F27.pdf","xu-etal-2018-building",[1583,1586,1589],{"paper_id":1576,"author_seq":247,"given_name":1584,"surname":1585,"affiliation":63,"orcid":63},"Fan","Xu",{"paper_id":1576,"author_seq":232,"given_name":1587,"surname":1588,"affiliation":63,"orcid":63},"Mingwen","Wang",{"paper_id":1576,"author_seq":218,"given_name":1590,"surname":1591,"affiliation":63,"orcid":63},"Maoxi","Li","Automatic language identification of an input sentence or a text written in similar languages, varieties or dialects is an important task in natural language processing. In this paper, we propose a scheme to represent Gan (Jiangxi province of China) Chinese dialects. In particular, it is a two-level and fine-grained representation using Chinese character, Chinese Pinyin and Chinese audio forms. 
Guided by the scheme, we manually annotate a Gan Chinese Dialects Corpus (GCDC) including 131.5 hours and 310 documents with 6 different genres, containing news, official document, story, prose, poet, letter and speech, from 19 different Gan regions. In addition, the preliminary evaluation on 2-way, 7-way and 20-way sentence-level Gan Chinese Dialects Identification (GCDI) justifies the appropriateness of the scheme to Gan Chinese dialects analysis and the usefulness of our manually annotated GCDC.",{"paper_id":1594,"title":1595,"year":81,"month":855,"day":63,"doi":1596,"resource_url":1597,"first_page":63,"last_page":63,"pdf_url":1598,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1599,"paper_type":860,"authors":1600,"abstract":1628},"lrec2018-main-037","A Recorded Debating Dataset","10.63317\u002F26z3obpfbq7h","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-037","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F66.pdf","mirkin-etal-2018-recorded",[1601,1604,1607,1610,1613,1616,1619,1622,1625],{"paper_id":1594,"author_seq":247,"given_name":1602,"surname":1603,"affiliation":63,"orcid":63},"Shachar","Mirkin",{"paper_id":1594,"author_seq":232,"given_name":1605,"surname":1606,"affiliation":63,"orcid":63},"Michal","Jacovi",{"paper_id":1594,"author_seq":218,"given_name":1608,"surname":1609,"affiliation":63,"orcid":63},"Tamar","Lavee",{"paper_id":1594,"author_seq":203,"given_name":1611,"surname":1612,"affiliation":63,"orcid":63},"Hong-Kwang","Kuo",{"paper_id":1594,"author_seq":188,"given_name":1614,"surname":1615,"affiliation":63,"orcid":63},"Samuel","Thomas",{"paper_id":1594,"author_seq":172,"given_name":1617,"surname":1618,"affiliation":63,"orcid":63},"Leslie","Sager",{"paper_id":1594,"author_seq":155,"given_name":1620,"surname":1621,"affiliation":63,"orcid":63},"Lili","Kotlerman",{"paper_id":1594,"author_seq":138,"given_name":1623,"surname":1624,"affiliation":63,"orcid":63},"Elad","Venezian",{"paper_id":
1594,"author_seq":121,"given_name":1626,"surname":1627,"affiliation":63,"orcid":63},"Noam","Slonim","This paper describes an English audio and textual dataset of debating speeches, a unique resource for the growing research field of computational argumentation and debating technologies. We detail the process of speech recording by professional debaters, the transcription of the speeches with an Automatic Speech Recognition (ASR) system, their consequent automatic processing to produce a text that is more \"NLP-friendly\", and in parallel -- the manual transcription of the speeches in order to produce gold-standard \"reference\" transcripts. We release 60 speeches on various controversial topics, each in five formats corresponding to the different stages in the production of the data. The intention is to allow utilizing this resource for multiple research purposes, be it the addition of in-domain training data for a debate-specific ASR system, or applying argumentation mining on either noisy or clean debate transcripts. 
We intend to make further releases of this data in the future.",{"paper_id":1630,"title":1631,"year":81,"month":855,"day":63,"doi":1632,"resource_url":1633,"first_page":63,"last_page":63,"pdf_url":1634,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1635,"paper_type":860,"authors":1636,"abstract":1646},"lrec2018-main-038","Building a Corpus from Handwritten Picture Postcards: Transcription, Annotation and Part-of-Speech Tagging","10.63317\u002F4vyyyyb2vxmw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-038","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F193.pdf","sugisaki-etal-2018-building",[1637,1640,1643],{"paper_id":1630,"author_seq":247,"given_name":1638,"surname":1639,"affiliation":63,"orcid":63},"Kyoko","Sugisaki",{"paper_id":1630,"author_seq":232,"given_name":1641,"surname":1642,"affiliation":63,"orcid":63},"Nicolas","Wiedmer",{"paper_id":1630,"author_seq":218,"given_name":1644,"surname":1645,"affiliation":63,"orcid":63},"Heiko","Hausendorf","In this paper, we present a corpus of over 11,000  holiday picture postcards written in German and Swiss German. The postcards have been collected for the purpose of text-linguistic investigations on the genre and its standardisation and variation over time. We discuss the processes and challenges of digitalisation, manual transcription, and manual annotation. In addition, we developed our own automatic text segmentation system and a part-of-speech tagger, since our texts often contain orthographic deviations, domain-specific structures such as fragments, subject-less sentences, interjections, discourse particles, and domain-specific formulaic communicative routines in salutation and greeting. In particular, we demonstrate that the CRF-based POS tagger could be boosted to a domain-specific text by adding a small amount of in-domain data. We showed that entropy-based training data sampling was competitive with random sampling in performing this task. 
The evaluation showed that our POS tagger achieved a F1 score of 0.93 (precision 0.94, recall 0.93), which outperformed a state-of-the-art POS tagger.",{"paper_id":1648,"title":1649,"year":81,"month":855,"day":63,"doi":1650,"resource_url":1651,"first_page":63,"last_page":63,"pdf_url":1652,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1653,"paper_type":860,"authors":1654,"abstract":1666},"lrec2018-main-039","A Lexical Tool for Academic Writing in Spanish based on Expert and Novice Corpora","10.63317\u002F56dkipykej3b","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-039","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F769.pdf","garcia-salido-etal-2018-lexical",[1655,1658,1660,1663],{"paper_id":1648,"author_seq":247,"given_name":1656,"surname":1657,"affiliation":63,"orcid":63},"Marcos","García Salido",{"paper_id":1648,"author_seq":232,"given_name":1656,"surname":1659,"affiliation":63,"orcid":63},"García",{"paper_id":1648,"author_seq":218,"given_name":1661,"surname":1662,"affiliation":63,"orcid":63},"Milka","Villayandre-Llamazares",{"paper_id":1648,"author_seq":203,"given_name":1664,"surname":1665,"affiliation":63,"orcid":63},"Margarita","Alonso-Ramos","The object of this article is to describe the extraction of data from a corpus of academic texts in Spanish and the use of those data for developing a lexical tool oriented to the production of academic texts. The corpus provides the lexical combinations that will be included in the afore-mentioned tool, namely collocations, idioms and formulas. They have been retrieved from the corpus controlling for their keyness (i.e., their specificity with regard to academic texts) and their even distribution across the corpus. For the extraction of collocations containing academic vocabulary other methods have been used, taking advantage of the morphological and syntactic information with which the corpus has been enriched. 
In the case of collocations and other multiword units, several association measures are being tested in order to restrict the list of candidates the lexicographers will have to deal with manually.",{"paper_id":1668,"title":1669,"year":81,"month":855,"day":63,"doi":1670,"resource_url":1671,"first_page":63,"last_page":63,"pdf_url":1672,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1673,"paper_type":860,"authors":1674,"abstract":1690},"lrec2018-main-040","Framing Named Entity Linking Error Types","10.63317\u002F52h56z532s75","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-040","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F612.pdf","brasoveanu-etal-2018-framing",[1675,1678,1681,1684,1687],{"paper_id":1668,"author_seq":247,"given_name":1676,"surname":1677,"affiliation":63,"orcid":63},"Adrian","Braşoveanu",{"paper_id":1668,"author_seq":232,"given_name":1679,"surname":1680,"affiliation":63,"orcid":63},"Giuseppe","Rizzo",{"paper_id":1668,"author_seq":218,"given_name":1682,"surname":1683,"affiliation":63,"orcid":63},"Philipp","Kuntschik",{"paper_id":1668,"author_seq":203,"given_name":1685,"surname":1686,"affiliation":63,"orcid":63},"Albert","Weichselbraun",{"paper_id":1668,"author_seq":188,"given_name":1688,"surname":1689,"affiliation":63,"orcid":63},"Lyndon J.B.","Nixon","Named  Entity  Linking        (NEL)  and  relation  extraction  forms  the  backbone  of  Knowledge  Base  Population  tasks.   The  recent  rise  oflarge open source Knowledge Bases and the continuous focus on improving NEL performance has led to the creation of automatedbenchmark  solutions  during  the  last  decade.   The  benchmarking  of  NEL  systems  offers  a  valuable  approach  to  understand        a  NELsystem’s performance quantitatively.  However, an in-depth qualitative analysis that helps improving NEL methods by identifying errorcauses  usually  requires        a  more  thorough  error  analysis.   
This  paper  proposes  a  taxonomy  to  frame  common  errors  and  applies  thistaxonomy in a survey study to assess the performance of four well-known Named Entity Linking systems on three recent gold standards.",{"paper_id":1692,"title":1693,"year":81,"month":855,"day":63,"doi":1694,"resource_url":1695,"first_page":63,"last_page":63,"pdf_url":1696,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1697,"paper_type":860,"authors":1698,"abstract":1711},"lrec2018-main-041","A FrameNet for Cancer Information in Clinical Narratives: Schema and Annotation","10.63317\u002F5ngfzz8v9a2p","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-041","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F37.pdf","roberts-etal-2018-framenet",[1699,1702,1705,1708],{"paper_id":1692,"author_seq":247,"given_name":1700,"surname":1701,"affiliation":63,"orcid":63},"Kirk","Roberts",{"paper_id":1692,"author_seq":232,"given_name":1703,"surname":1704,"affiliation":63,"orcid":63},"Yuqi","Si",{"paper_id":1692,"author_seq":218,"given_name":1706,"surname":1707,"affiliation":63,"orcid":63},"Anshul","Gandhi",{"paper_id":1692,"author_seq":203,"given_name":1709,"surname":1710,"affiliation":63,"orcid":63},"Elmer","Bernstam","This paper presents a pilot project named Cancer FrameNet. The project's goal is a general-purpose natural language processing (NLP) resource for cancer-related information in clinical notes (i.e., patient records in an electronic health record system). While previous cancer NLP annotation projects have largely been ad hoc resources to address a specific and immediate information need, the frame semantic method employed here emphasizes the information presented in the notes themselves and its linguistic structure. To this end, three semantic frames (targeting the high-level tasks of cancer diagnoses, cancer therapeutic procedures, and tumor descriptions) are created and annotated on a clinical text corpus. 
Prior to annotation, candidate sentences are extracted from a clinical data warehouse and de-identified to remove any private information. The frames are then annotated with the three frames totaling over thirty frame elements. This paper describes these steps in the pilot project and discusses issues encountered to evaluate the feasibility of general-purpose linguistic resources for extracting cancer-related information.",{"paper_id":1713,"title":1714,"year":81,"month":855,"day":63,"doi":1715,"resource_url":1716,"first_page":63,"last_page":63,"pdf_url":1717,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1718,"paper_type":860,"authors":1719,"abstract":1742},"lrec2018-main-042","A New Corpus to Support Text Mining for the Curation of Metabolites in the ChEBI Database","10.63317\u002F2k4h99c3w2uz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-042","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F229.pdf","shardlow-etal-2018-new",[1720,1722,1725,1728,1730,1733,1736,1739],{"paper_id":1713,"author_seq":247,"given_name":1226,"surname":1721,"affiliation":63,"orcid":63},"Shardlow",{"paper_id":1713,"author_seq":232,"given_name":1723,"surname":1724,"affiliation":63,"orcid":63},"Nhung","Nguyen",{"paper_id":1713,"author_seq":218,"given_name":1726,"surname":1727,"affiliation":63,"orcid":63},"Gareth","Owen",{"paper_id":1713,"author_seq":203,"given_name":1217,"surname":1729,"affiliation":63,"orcid":63},"O’Donovan",{"paper_id":1713,"author_seq":188,"given_name":1731,"surname":1732,"affiliation":63,"orcid":63},"Andrew","Leach",{"paper_id":1713,"author_seq":172,"given_name":1734,"surname":1735,"affiliation":63,"orcid":63},"John","McNaught",{"paper_id":1713,"author_seq":155,"given_name":1737,"surname":1738,"affiliation":63,"orcid":63},"Steve","Turner",{"paper_id":1713,"author_seq":138,"given_name":1740,"surname":1741,"affiliation":63,"orcid":63},"Sophia","Ananiadou","We present a new corpus of 200 abstracts and 100 
full text papers which have been annotated with named entities and relations in the biomedical domain as part of the OpenMinTeD project. This corpus facilitates the goal in OpenMinTeD of making text and data mining accessible to the users who need it most. We describe the process we took to annotate the corpus with entities (Metabolite, Chemical, Protein, Species, Biological Activity and Spectral Data) and relations (Isolated From, Associated With, Binds With and Metabolite Of). We report inter-annotator agreement (using F-score) for entities of between 0.796 and 0.892 using a strict matching protocol and between 0.875 and 0.963 using a relaxed matching protocol. For relations we report inter annotator agreement of between 0.591 and 0.693 using a strict matching protocol and between 0.744 and 0.793 using a relaxed matching protocol. We describe how this corpus can be used within ChEBI to facilitate text and data mining and how the integration of this work with the OpenMinTeD text and data mining platform will aid curation of ChEBI and other biomedical databases.",{"paper_id":1744,"title":1745,"year":81,"month":855,"day":63,"doi":1746,"resource_url":1747,"first_page":63,"last_page":63,"pdf_url":1748,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1749,"paper_type":860,"authors":1750,"abstract":1761},"lrec2018-main-043","Parallel Corpora for the Biomedical Domain","10.63317\u002F4jwzppkn3qj9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-043","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F854.pdf","neveol-etal-2018-parallel",[1751,1752,1755,1758],{"paper_id":1744,"author_seq":247,"given_name":1403,"surname":1404,"affiliation":63,"orcid":63},{"paper_id":1744,"author_seq":232,"given_name":1753,"surname":1754,"affiliation":63,"orcid":63},"Antonio","Jimeno 
Yepes",{"paper_id":1744,"author_seq":218,"given_name":1756,"surname":1757,"affiliation":63,"orcid":63},"Mariana","Neves",{"paper_id":1744,"author_seq":203,"given_name":1759,"surname":1760,"affiliation":63,"orcid":63},"Karin","Verspoor","A vast amount of biomedical information is available in the form of scientific literature and government-authored patient information documents. While English is the most widely used language in many of these sources, there is a need to provide access to health information in languages other than English. Parallel corpora can be leveraged to implement cross-lingual information retrieval or machine translation tools. Herein, we review the extent of parallel corpus coverage in the biomedical domain. Specifically, we perform a scoping review of existing resources and we describe the recent development of new datasets for scientific literature (the EDP dataset and an extension of the Scielo corpus) and clinical trials (the ReBEC corpus). These corpora are currently being used in the biomedical task in the Conference on Machine Translation (WMT’16 and WMT’17), which illustrates their potential for improving and evaluating biomedical machine translation systems. 
Furthermore, we suggest additional applications for multilingual natural language processing using these resources, and plan to extend resource coverage to additional text genres and language pairs.",{"paper_id":1763,"title":1764,"year":81,"month":855,"day":63,"doi":1765,"resource_url":1766,"first_page":63,"last_page":63,"pdf_url":1767,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1768,"paper_type":860,"authors":1769,"abstract":1787},"lrec2018-main-044","Medical Entity Corpus with PICO elements and Sentiment Analysis","10.63317\u002F5myonnfuk3u3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-044","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1088.pdf","zlabinger-etal-2018-medical",[1770,1773,1776,1779,1781,1784],{"paper_id":1763,"author_seq":247,"given_name":1771,"surname":1772,"affiliation":63,"orcid":63},"Markus","Zlabinger",{"paper_id":1763,"author_seq":232,"given_name":1774,"surname":1775,"affiliation":63,"orcid":63},"Linda","Andersson",{"paper_id":1763,"author_seq":218,"given_name":1777,"surname":1778,"affiliation":63,"orcid":63},"Allan","Hanbury",{"paper_id":1763,"author_seq":203,"given_name":1780,"surname":1775,"affiliation":63,"orcid":63},"Michael",{"paper_id":1763,"author_seq":188,"given_name":1782,"surname":1783,"affiliation":63,"orcid":63},"Vanessa","Quasnik",{"paper_id":1763,"author_seq":172,"given_name":1785,"surname":1786,"affiliation":63,"orcid":63},"Jon","Brassey","In this paper, we present our process to establish a PICO and a sentiment annotated corpus of clinical trial publications. PICO stands for Population, Intervention, Comparison and Outcome --- these four classes can be used for more advanced and specific search queries. For example, a physician can determine how well a drug works only in the subgroup of children. 
Additionally to the PICO extraction, we conducted a sentiment annotation, where the sentiment refers to whether the conclusion of a trial was positive, negative or neutral. We created both corpora with the help of medical experts and non-experts as annotators.",{"paper_id":1789,"title":1790,"year":81,"month":855,"day":63,"doi":1791,"resource_url":1792,"first_page":63,"last_page":63,"pdf_url":1793,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1794,"paper_type":860,"authors":1795,"abstract":1802},"lrec2018-main-045","Word Embedding Approach for Synonym Extraction of Multi-Word Terms","10.63317\u002F3u6bwp3okbvm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-045","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F36.pdf","hazem-daille-2018-word",[1796,1799],{"paper_id":1789,"author_seq":247,"given_name":1797,"surname":1798,"affiliation":63,"orcid":63},"Amir","Hazem",{"paper_id":1789,"author_seq":232,"given_name":1800,"surname":1801,"affiliation":63,"orcid":63},"Béatrice","Daille","The acquisition of synonyms and quasi-synonyms of multi-word terms (MWTs) is a relatively new and under represented topic of research. However, dealing with MWT synonyms and semantically related terms is a challenging task, especially when MWT synonyms are single word terms (SWTs) or MWTs of different lengths. While several researches addressed synonym extraction of SWTs, few of them dealt with MWTs and fewer or none while MWTs synonyms are of variable lengths. The present research aims at introducing a new word-embedding-based approach for the automatic acquisition of synonyms of MWTs that manage length variability. 
We evaluate our approach on two specialized domain corpora, a French\u002FEnglish corpus of the wind energy domain and  a French\u002FEnglish corpus of the breast cancer domain and show superior results compared to baseline approaches.",{"paper_id":1804,"title":1805,"year":81,"month":855,"day":63,"doi":1806,"resource_url":1807,"first_page":63,"last_page":63,"pdf_url":1808,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1809,"paper_type":860,"authors":1810,"abstract":1815},"lrec2018-main-046","A Large Automatically-Acquired All-Words List of Multiword Expressions Scored for Compositionality","10.63317\u002F4ztx49hfn2ce","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-046","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F650.pdf","roberts-egg-2018-large",[1811,1813],{"paper_id":1804,"author_seq":247,"given_name":1812,"surname":1701,"affiliation":63,"orcid":63},"Will",{"paper_id":1804,"author_seq":232,"given_name":1771,"surname":1814,"affiliation":63,"orcid":63},"Egg","We present and make available a large automatically-acquired all-words list of English multiword expressions scored for compositionality.  Intrinsic evaluation against manually-produced gold standards demonstrates that our compositionality estimates are sound, and extrinsic evaluation via incorporation of our list into a machine translation system to better handle idiomatic expressions results in a statistically significant improvement to the system's BLEU scores.  
As the method used to produce the list is language-independent, we also make available lists in seven other European languages.",{"paper_id":1817,"title":1818,"year":81,"month":855,"day":63,"doi":1819,"resource_url":1820,"first_page":63,"last_page":63,"pdf_url":1821,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1822,"paper_type":860,"authors":1823,"abstract":1827},"lrec2018-main-047","A Hybrid Approach for Automatic Extraction of Bilingual Multiword Expressions from Parallel Corpora","10.63317\u002F5okjohvt6aax","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-047","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F958.pdf","semmar-2018-hybrid",[1824],{"paper_id":1817,"author_seq":247,"given_name":1825,"surname":1826,"affiliation":63,"orcid":63},"Nasredine","Semmar","Specific-domain bilingual lexicons play an important role for domain adaptation in machine translation. The entries of these types of lexicons are mostly composed of MultiWord Expressions (MWEs). The manual construction of MWEs bilingual lexicons is costly and time-consuming. We often use word alignment approaches to automatically construct bilingual lexicons of MWEs from parallel corpora. We present in this paper a hybrid approach to extract and align MWEs from parallel corpora in a one-step process. We formalize the alignment process as an integer linear programming problem in order to find an approximated optimal solution. This process generates lists of MWEs with their translations, which are then filtered using linguistic patterns for the construction of the bilingual lexicons of MWEs. We evaluate the bilingual lexicons of MWEs produced by this approach using two methods: a manual evaluation of the alignment quality and an evaluation of the impact of this alignment on the translation quality of the phrase-based statistical machine translation system Moses. 
We experimentally show that the integration of the bilingual MWEs and their linguistic information into the translation model improves the performance of Moses.",{"paper_id":1829,"title":1830,"year":81,"month":855,"day":63,"doi":1831,"resource_url":1832,"first_page":63,"last_page":63,"pdf_url":1833,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1834,"paper_type":860,"authors":1835,"abstract":1848},"lrec2018-main-048","No more beating about the bush : A Step towards Idiom Handling for Indian Language NLP","10.63317\u002F4ywaxwj2mz6o","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-048","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1052.pdf","agrawal-etal-2018-beating",[1836,1839,1842,1845],{"paper_id":1829,"author_seq":247,"given_name":1837,"surname":1838,"affiliation":63,"orcid":63},"Ruchit","Agrawal",{"paper_id":1829,"author_seq":232,"given_name":1840,"surname":1841,"affiliation":63,"orcid":63},"Vighnesh","Chenthil Kumar",{"paper_id":1829,"author_seq":218,"given_name":1843,"surname":1844,"affiliation":63,"orcid":63},"Vigneshwaran","Muralidharan",{"paper_id":1829,"author_seq":203,"given_name":1846,"surname":1847,"affiliation":63,"orcid":63},"Dipti","Sharma","One of the major challenges in the field of Natural Language Processing (NLP) is the handling of idioms; seemingly ordinary phrases which could be further conjugated or even spread across the sentence to fit the context. Since idioms are a part of natural language, the ability to tackle them brings us closer to creating efficient NLP tools. This paper presents a multilingual parallel idiom dataset for seven Indian languages in addition to English and demonstrates its usefulness for two NLP applications - Machine Translation and Sentiment Analysis. 
We observe significant improvement for both the subtasks over baseline models trained without employing the idiom dataset.",{"paper_id":1850,"title":1851,"year":81,"month":855,"day":63,"doi":1852,"resource_url":1853,"first_page":63,"last_page":63,"pdf_url":1854,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1855,"paper_type":860,"authors":1856,"abstract":1866},"lrec2018-main-049","Sentence Level Temporality Detection using an Implicit Time-sensed Resource","10.63317\u002F48v7xjaaimdh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-049","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F210.pdf","kamila-etal-2018-sentence",[1857,1860,1863],{"paper_id":1850,"author_seq":247,"given_name":1858,"surname":1859,"affiliation":63,"orcid":63},"Sabyasachi","Kamila",{"paper_id":1850,"author_seq":232,"given_name":1861,"surname":1862,"affiliation":63,"orcid":63},"Asif","Ekbal",{"paper_id":1850,"author_seq":218,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},"Pushpak","Bhattacharyya","Temporal sense detection of any word is an important aspect for detecting temporality at the sentence level. In this paper, at first, we build a temporal resource based on a semi-supervised learning approach where each Hindi-WordNet synset is classified into one of the five classes, namely past, present, future, neutral and atemporal. This resource is then utilized for tagging the sentences with past, present and future temporal senses. For the sentence-level tagging, we use a rule-based as well as a machine learning-based approach. 
We provide detailed analysis along with necessary resources.",{"paper_id":1868,"title":1869,"year":81,"month":855,"day":63,"doi":1870,"resource_url":1871,"first_page":63,"last_page":63,"pdf_url":1872,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1873,"paper_type":860,"authors":1874,"abstract":1884},"lrec2018-main-050","Comprehensive Annotation of Various Types of Temporal Information on the Time Axis","10.63317\u002F24rffczi9c4s","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-050","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F579.pdf","sakaguchi-etal-2018-comprehensive",[1875,1878,1881],{"paper_id":1868,"author_seq":247,"given_name":1876,"surname":1877,"affiliation":63,"orcid":63},"Tomohiro","Sakaguchi",{"paper_id":1868,"author_seq":232,"given_name":1879,"surname":1880,"affiliation":63,"orcid":63},"Daisuke","Kawahara",{"paper_id":1868,"author_seq":218,"given_name":1882,"surname":1883,"affiliation":63,"orcid":63},"Sadao","Kurohashi","In order to make the temporal interpretation of text, there have been many studies linking event and temporal information, such as temporal ordering of events and timeline generation. To train and evaluate models in these studies, many corpora that associate event information with time information have been developed. In this paper, we propose an annotation scheme that anchors expressions in text to the time axis comprehensively, extending the previous studies in the following two points. One of the points is to annotate not only expressions with strong temporality but also expressions with weak temporality, such as states and habits. The other point is that various types of temporal information, such as frequency and duration, can be anchored to the time axis. Using this annotation scheme, we annotated a subset of Kyoto University Text Corpus. 
Since the corpus has already been annotated predicate-argument structures and coreference relations, it can be utilized for integrated information analysis of events, entities and time.",{"paper_id":1886,"title":1887,"year":81,"month":855,"day":63,"doi":1888,"resource_url":1889,"first_page":63,"last_page":63,"pdf_url":1890,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1891,"paper_type":860,"authors":1892,"abstract":1899},"lrec2018-main-051","Systems’ Agreements and Disagreements in Temporal Processing: An Extensive Error Analysis of the TempEval-3 Task","10.63317\u002F3qenjqy9hs6k","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-051","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F880.pdf","caselli-morante-2018-systems",[1893,1896],{"paper_id":1886,"author_seq":247,"given_name":1894,"surname":1895,"affiliation":63,"orcid":63},"Tommaso","Caselli",{"paper_id":1886,"author_seq":232,"given_name":1897,"surname":1898,"affiliation":63,"orcid":63},"Roser","Morante","In this article we review Temporal Processing systems that participated in the TempEval-3 task as a basis to develop our own system, that we also present and release. The system incorporates high level lexical semantic features, obtaining the best scores for event detection (F1-Class 72.24) and second best result for temporal relation classification from raw text (F1 29.69) when evaluated on the TempEval-3 data. Additionally, we analyse the errors of all TempEval-3 systems for which the output is publicly available with the purpose of finding out what are the weaknesses of current approaches. 
Although incorporating lexical semantics features increases the performance of our system, the error analysis shows that systems should incorporate inference mechanisms and world knowledge, as well as having strategies to compensate for data skewness.",{"paper_id":1901,"title":1902,"year":81,"month":855,"day":63,"doi":1903,"resource_url":1904,"first_page":63,"last_page":63,"pdf_url":1905,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1906,"paper_type":860,"authors":1907,"abstract":1914},"lrec2018-main-052","Annotating Temporally-Anchored Spatial Knowledge by Leveraging Syntactic Dependencies","10.63317\u002F3isfrdhrkv79","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-052","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F951.pdf","vempala-blanco-2018-annotating",[1908,1911],{"paper_id":1901,"author_seq":247,"given_name":1909,"surname":1910,"affiliation":63,"orcid":63},"Alakananda","Vempala",{"paper_id":1901,"author_seq":232,"given_name":1912,"surname":1913,"affiliation":63,"orcid":63},"Eduardo","Blanco","This paper presents a two-step methodology to annotate temporally-anchored spatial knowledge on top of OntoNotes. We first generate potential knowledge using syntactic dependencies, and then crowdsource annotations to validate the potential knowledge. The resulting annotations indicate how long entities are or are not located somewhere, and temporally anchor this information. 
Crowdsourcing experiments show that spatial inferences are ubiquitous and intuitive, and experimental results show that they can be done automatically.",{"paper_id":1916,"title":1917,"year":81,"month":855,"day":63,"doi":1918,"resource_url":1919,"first_page":63,"last_page":63,"pdf_url":1920,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1921,"paper_type":860,"authors":1922,"abstract":1929},"lrec2018-main-053","Contextualized Usage-Based Material Selection","10.63317\u002F49xxephfqe9i","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-053","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F729.pdf","de-hertog-desmet-2018-contextualized",[1923,1926],{"paper_id":1916,"author_seq":247,"given_name":1924,"surname":1925,"affiliation":63,"orcid":63},"Dirk","De Hertog",{"paper_id":1916,"author_seq":232,"given_name":1927,"surname":1928,"affiliation":63,"orcid":63},"Piet","Desmet","In this paper, we combine several NLP-functionalities to organize examples drawn from corpora. The application’s primary target audience are language learners. Currently, authentic linguistic examples for a given keyword search are often organized alphabetically according to context. From this, it is not always clear which contextual regularities actually exist on a syntactic, collocational and semantic level. Showing information at different levels of abstraction will help with the discovery of linguistic regularities and thus improve linguistic understanding. Practically this translates in a system that groups retrieved results on syntactic grounds, after which the examples are further organized at the hand of semantic similarity within certain phrasal slots. 
Visualization algorithms are then used to show focused information in phrasal slots, laying bare semantic restrictions within the construction.",{"paper_id":1931,"title":1932,"year":81,"month":855,"day":63,"doi":1933,"resource_url":1934,"first_page":63,"last_page":63,"pdf_url":1935,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1936,"paper_type":860,"authors":1937,"abstract":1943},"lrec2018-main-054","CBFC: a parallel L2 speech corpus for Korean and French learners","10.63317\u002F2a4tswabudyr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-054","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F874.pdf","yoo-kim-2018-cbfc",[1938,1941],{"paper_id":1931,"author_seq":247,"given_name":1939,"surname":1940,"affiliation":63,"orcid":63},"Hiyon","Yoo",{"paper_id":1931,"author_seq":232,"given_name":1942,"surname":1104,"affiliation":63,"orcid":63},"Inyoung","In this paper, we present the design of a bilingual corpus of French learners of Korean and Korean learners of French using the same experimental design. This language resource contains mainly speech data, gathered among learners with different proficiency levels and in different speaking contexts (read and spontaneous speech). 
We aim at providing a translated and annotated corpus to the scientific community which can be used for a large array of purposes in the field of theoretical but also applied linguistics.",{"paper_id":1945,"title":1946,"year":81,"month":855,"day":63,"doi":1947,"resource_url":1948,"first_page":63,"last_page":63,"pdf_url":1949,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1950,"paper_type":860,"authors":1951,"abstract":1961},"lrec2018-main-055","SW4ALL: a CEFR Classified and Aligned Corpus for Language Learning","10.63317\u002F25wrmjy63ec9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-055","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1012.pdf","wilkens-etal-2018-sw4all",[1952,1955,1958],{"paper_id":1945,"author_seq":247,"given_name":1953,"surname":1954,"affiliation":63,"orcid":63},"Rodrigo","Wilkens",{"paper_id":1945,"author_seq":232,"given_name":1956,"surname":1957,"affiliation":63,"orcid":63},"Leonardo","Zilio",{"paper_id":1945,"author_seq":218,"given_name":1959,"surname":1960,"affiliation":63,"orcid":63},"Cédrick","Fairon","Learning a second language is a task that requires a good amount of time and dedication. Part of the process involves the reading and writing of texts in the target language, and so, to facilitate this process, especially in terms of reading, teachers tend to search for texts that are associated to the interests and capabilities of the learners. But the search for this kind of text is also a time-consuming task. By focusing on this need for texts that are suited for different language learners, we present in this study the SW4ALL, a corpus  with documents classified by language proficiency level (based on the CEFR recommendations) that allows the learner to observe ways of describing the same topic or content by using strategies from different proficiency levels. 
This corpus uses the alignments between the English Wikipedia and the Simple English Wikipedia for ensuring the use of similar content or topic in pairs of text, and an annotation of language levels for ensuring the difference of language proficiency level between them. Considering the size of the corpus, we used an automatic approach for the annotation, followed by an analysis to sort out annotation errors. SW4ALL contains 8.669 pairs of documents that present different levels of language proficiency.",{"paper_id":1963,"title":1964,"year":81,"month":855,"day":63,"doi":1965,"resource_url":1966,"first_page":63,"last_page":63,"pdf_url":1967,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1968,"paper_type":860,"authors":1969,"abstract":1974},"lrec2018-main-056","Towards a Diagnosis of Textual Difficulties for Children with Dyslexia","10.63317\u002F3pemju5cmqox","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-056","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F891.pdf","quiniou-daille-2018-towards",[1970,1973],{"paper_id":1963,"author_seq":247,"given_name":1971,"surname":1972,"affiliation":63,"orcid":63},"Solen","Quiniou",{"paper_id":1963,"author_seq":232,"given_name":1800,"surname":1801,"affiliation":63,"orcid":63},"Children's books are generally designed for children of a certain age group. For underage children or children with reading disorders, like dyslexia, there may be passages of the books that are difficult to understand. This can be due to words not known in the vocabulary of underage children, to words made of complex subparts (to pronounce, for example), or to the presence of anaphoras that have to be resolved by the children during the reading. In this paper, we present a study on diagnosing the difficulties appearing in French children's books. 
We are more particularly interested on the difficulties coming from pronouns that can disrupt the story comprehension for children with dyslexia and we focus on the subject pronouns \"il\" and \"elle\" (corresponding to the pronoun \"it\"). We automatically identify the pleonastic pronouns (eg, in \"it's raining\") and the pronominal anaphoras, as well as the referents of the pronominal anaphoras. We also detect difficult anaphoras that are more likely to lead to miscomprehension from the children: this is the first step to diagnose the textual difficulties of children's books. We evaluate our approach on several French children's books that were manually annotated by a speech therapist. Our first results show that we are able to detect half of the difficult anaphorical pronouns.",{"paper_id":1976,"title":1977,"year":81,"month":855,"day":63,"doi":1978,"resource_url":1979,"first_page":63,"last_page":63,"pdf_url":1980,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1981,"paper_type":860,"authors":1982,"abstract":1992},"lrec2018-main-057","Coreference Resolution in FreeLing 4.0","10.63317\u002F2ksuhw9wd86d","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-057","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F40.pdf","marimon-etal-2018-coreference",[1983,1986,1989],{"paper_id":1976,"author_seq":247,"given_name":1984,"surname":1985,"affiliation":63,"orcid":63},"Montserrat","Marimon",{"paper_id":1976,"author_seq":232,"given_name":1987,"surname":1988,"affiliation":63,"orcid":63},"Lluís","Padró",{"paper_id":1976,"author_seq":218,"given_name":1990,"surname":1991,"affiliation":63,"orcid":63},"Jordi","Turmo","This paper presents the integration of RelaxCor into FreeLing. RelaxCor is a coreference resolution system based on constraint satisfaction that ranked second in CoNLL-2011 shared task.  FreeLing is an open-source library for NLP with more than fifteen years of existence and a widespread user community. 
We present the difficulties found in porting RelaxCor from a shared task scenario to a production enviroment, as well as the solutions devised. We present two strategies for this integration and a rough evaluation of the obtained results.",{"paper_id":1994,"title":1995,"year":81,"month":855,"day":63,"doi":1996,"resource_url":1997,"first_page":63,"last_page":63,"pdf_url":1998,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":1999,"paper_type":860,"authors":2000,"abstract":2004},"lrec2018-main-058","BASHI: A Corpus of Wall Street Journal Articles Annotated with Bridging Links","10.63317\u002F4zt9i5mar6pv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-058","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F84.pdf","rosiger-2018-bashi",[2001],{"paper_id":1994,"author_seq":247,"given_name":2002,"surname":2003,"affiliation":63,"orcid":63},"Ina","Rösiger","This paper presents a corpus resource for the anaphoric phenomenon of bridging, named BASHI. The corpus consisting of 50 Wall Street Journal (WSJ) articles adds bridging anaphors and their antecedents to the other gold annotations that have been created as part of the OntoNotes project (Weischedel et al. 2011). Bridging anaphors are context-dependent expressions that do not refer to the same entity as their antecedent, but to a related entity. Bridging resolution is an under-researched area of NLP, where the lack of annotated training data makes the application of statistical models difficult. Thus, we believe that the corpus is a valuable resource for researchers interested in anaphoric phenomena going beyond coreference, as it can be combined with other corpora to create a larger corpus resource. The corpus contains 57,709 tokens and 459 bridging pairs and is available for download in an offset-based format and a CoNLL-12 style bridging column that can be merged with the other annotation layers in OntoNotes. 
The paper also reviews previous annotation efforts and different definitions of bridging and reports challenges with respect to the bridging annotation.",{"paper_id":2006,"title":2007,"year":81,"month":855,"day":63,"doi":2008,"resource_url":2009,"first_page":63,"last_page":63,"pdf_url":2010,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2011,"paper_type":860,"authors":2012,"abstract":2016},"lrec2018-main-059","SACR: A Drag-and-Drop Based Tool for Coreference Annotation","10.63317\u002F3cghguv9ewe7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-059","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F178.pdf","oberle-2018-sacr",[2013],{"paper_id":2006,"author_seq":247,"given_name":2014,"surname":2015,"affiliation":63,"orcid":63},"Bruno","Oberle","This paper introduces SACR, an easy-to-use coreference chain annotation tool, which is used to annotate large corpora for Natural Language Processing applications. Coreference annotation is usually considered as costly both in terms of time and human resources. So, in order to find the easiest annotation strategy, we will first of all compare several annotation schemes implemented in existing tools. Since interface ergonomics is also an important part of our research, we then focus on identifying the most helpful features to reduce the strain for annotators. In the next section of the paper, we present SACR in details. This tool has been developped specifically for coreference annotation, and its intuitive user interface has been designed to facilitate and speed up the annotation process, making SACR equally suited for students, occasional and non-technical users. In order to create coreference chains, elements are selected by clicking on the corresponding tokens. Coreference relations are then created by drag-and-dropping expressions one over the other. 
Finally, color frames around marked expressions help the user to visualize both marked expressions and their relations. SACR is open source, distributed under the terms of the Mozilla Public License, version 2.0, and freely available online.",{"paper_id":2018,"title":2019,"year":81,"month":855,"day":63,"doi":2020,"resource_url":2021,"first_page":63,"last_page":63,"pdf_url":2022,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2023,"paper_type":860,"authors":2024,"abstract":2034},"lrec2018-main-060","Deep Neural Networks for Coreference Resolution for Polish","10.63317\u002F3zspjtf5bc5p","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-060","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F183.pdf","niton-etal-2018-deep",[2025,2028,2031],{"paper_id":2018,"author_seq":247,"given_name":2026,"surname":2027,"affiliation":63,"orcid":63},"Bartłomiej","Nitoń",{"paper_id":2018,"author_seq":232,"given_name":2029,"surname":2030,"affiliation":63,"orcid":63},"Paweł","Morawiecki",{"paper_id":2018,"author_seq":218,"given_name":2032,"surname":2033,"affiliation":63,"orcid":63},"Maciej","Ogrodniczuk","The paper presents several configurations of deep neural networks aimed at the task of coreference resolution for Polish. Starting with the basic feature set and standard word embedding vector size we examine the setting with larger vectors, more extensive sets of mention features, increased number of negative examples, Siamese network architecture and a global mention connection algorithm. The highest results are achieved by the system combining our best deep neural architecture with the sieve-based approach – the cascade of rule-based coreference resolvers ordered from most to least precise. All systems are evaluated on the data of the Polish Coreference Corpus featuring 540K tokens and 180K mentions. 
The best variant improves the state of the art for Polish by 0.53 F1 points, reaching 81.23 points of the CoNLL metric.",{"paper_id":2036,"title":2037,"year":81,"month":855,"day":63,"doi":2038,"resource_url":2039,"first_page":63,"last_page":63,"pdf_url":2040,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2041,"paper_type":860,"authors":2042,"abstract":2055},"lrec2018-main-061","SzegedKoref: A Hungarian Coreference Corpus","10.63317\u002F5hghr677923d","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-061","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F325.pdf","vincze-etal-2018-szegedkoref",[2043,2046,2049,2052],{"paper_id":2036,"author_seq":247,"given_name":2044,"surname":2045,"affiliation":63,"orcid":63},"Veronika","Vincze",{"paper_id":2036,"author_seq":232,"given_name":2047,"surname":2048,"affiliation":63,"orcid":63},"Klára","Hegedűs",{"paper_id":2036,"author_seq":218,"given_name":2050,"surname":2051,"affiliation":63,"orcid":63},"Alex","Sliz-Nagy",{"paper_id":2036,"author_seq":203,"given_name":2053,"surname":2054,"affiliation":63,"orcid":63},"Richárd","Farkas","In this paper we introduce SzegedKoref, a Hungarian corpus in which coreference relations are manually annotated. For annotation, we selected some texts of Szeged Treebank, the biggest treebank of Hungarian with manual annotation at several linguistic layers. The corpus contains approximately 55,000 tokens and 4000 sentences. Due to its size, the corpus can be exploited in training and testing machine learning based coreference resolution systems, which we would like to implement in the near future. We present the annotated texts, we describe the annotated categories of anaphoric relations, we report on the annotation process and we offer several examples of each annotated category. 
Two linguistic phenomena -- phonologically empty pronouns and pronouns referring to subordinate clauses -- are important characteristics of Hungarian coreference relations. In our paper, we also discuss both of them.",{"paper_id":2057,"title":2058,"year":81,"month":855,"day":63,"doi":2059,"resource_url":2060,"first_page":63,"last_page":63,"pdf_url":2061,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2062,"paper_type":860,"authors":2063,"abstract":2070},"lrec2018-main-062","A Corpus to Learn Refer-to-as Relations for Nominals","10.63317\u002F2ciroz5jfo3k","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-062","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F328.pdf","ahmad-chang-2018-corpus",[2064,2067],{"paper_id":2057,"author_seq":247,"given_name":2065,"surname":2066,"affiliation":63,"orcid":63},"Wasi","Ahmad",{"paper_id":2057,"author_seq":232,"given_name":2068,"surname":2069,"affiliation":63,"orcid":63},"Kai-Wei","Chang","Continuous representations for words or phrases, trained on large unlabeled corpora are proved very useful for many natural language processing tasks. While these vector representations capture many fine-grained syntactic and semantic regularities among words or phrases, it often lacks coreferential information which is useful for many downstream tasks like information extraction, text summarization etc. In this paper, we argue that good word and phrase embeddings should contain information for identifying refer-to-as relationship and construct a corpus from Wikipedia to generate coreferential neural embeddings for nominals. The term nominal refers to a word or a group of words that functions like a noun phrase. In addition, we use coreference resolution as a proxy to evaluate the learned neural embeddings for noun phrases. 
To simplify the evaluation procedure, we design a coreferential phrase prediction task where the learned nominal embeddings are used to predict which candidate nominals can be referred to a target nominal. We further describe how to construct an evaluation dataset for such task from well known OntoNotes corpus and demonstrate encouraging baseline results.",{"paper_id":2072,"title":2073,"year":81,"month":855,"day":63,"doi":2074,"resource_url":2075,"first_page":63,"last_page":63,"pdf_url":2076,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2077,"paper_type":860,"authors":2078,"abstract":2098},"lrec2018-main-063","Sanaphor++: Combining Deep Neural Networks with Semantics for Coreference Resolution","10.63317\u002F4ipdmz3antfj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-063","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F740.pdf","plu-etal-2018-sanaphor",[2079,2082,2085,2088,2091,2094,2097],{"paper_id":2072,"author_seq":247,"given_name":2080,"surname":2081,"affiliation":63,"orcid":63},"Julien","Plu",{"paper_id":2072,"author_seq":232,"given_name":2083,"surname":2084,"affiliation":63,"orcid":63},"Roman","Prokofyev",{"paper_id":2072,"author_seq":218,"given_name":2086,"surname":2087,"affiliation":63,"orcid":63},"Alberto","Tonon",{"paper_id":2072,"author_seq":203,"given_name":2089,"surname":2090,"affiliation":63,"orcid":63},"Philippe","Cudré-Mauroux",{"paper_id":2072,"author_seq":188,"given_name":2092,"surname":2093,"affiliation":63,"orcid":63},"Djellel Eddine","Difallah",{"paper_id":2072,"author_seq":172,"given_name":2095,"surname":2096,"affiliation":63,"orcid":63},"Raphaël","Troncy",{"paper_id":2072,"author_seq":155,"given_name":1679,"surname":1680,"affiliation":63,"orcid":63},"Coreference resolution has always been a challenging task in Natural Language Processing. 
Machine learning and semantic techniques have improved the state of the art over time, though in recent years the biggest step forward has been made using deep neural networks.
ANCOR corpus. This version adds syntactic annotations in addition to the existing coreference and speech transcription ones. This corpus is also released in a new TEI-compliant XML format.",{"paper_id":2122,"title":2123,"year":81,"month":855,"day":63,"doi":2124,"resource_url":2125,"first_page":63,"last_page":63,"pdf_url":2126,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2127,"paper_type":860,"authors":2128,"abstract":2137},"lrec2018-main-065","ParCorFull: a Parallel Corpus Annotated with Full Coreference","10.63317\u002F4h7r3ojn4aic","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-065","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F941.pdf","lapshinova-koltunski-etal-2018-parcorfull",[2129,2132,2134],{"paper_id":2122,"author_seq":247,"given_name":2130,"surname":2131,"affiliation":63,"orcid":63},"Ekaterina","Lapshinova-Koltunski",{"paper_id":2122,"author_seq":232,"given_name":904,"surname":2133,"affiliation":63,"orcid":63},"Hardmeier",{"paper_id":2122,"author_seq":218,"given_name":2135,"surname":2136,"affiliation":63,"orcid":63},"Pauline","Krielke","In this paper, we describe a parallel corpus annotated with full coreference chains that has been created to address an important problem that machine translation and other multilingual natural language processing (NLP) technologies face – translation of coreference across languages. Recent research in multilingual coreference and automatic pronoun translation has led to important insights into the problem and some promising results. However, its scope has been restricted to pronouns, whereas the phenomenon is not limited to anaphoric pronouns. Our corpus contains parallel texts for the language pair English-German, two major European languages. 
Despite being typologically very close, these languages still have systemic differences in the realisation of coreference, and thus pose problems for multilingual coreference resolution and machine translation. Our parallel corpus with full annotation of coreference will be a valuable resource with a variety of uses not only for NLP applications, but also for contrastive linguists and researchers in translation studies. This resource supports research on the mechanisms involved in coreference translation in order to develop a better understanding of the phenomenon. The corpus is available from the LINDAT repository at http:\u002F\u002Fhdl.handle.net\u002F11372\u002FLRT-2614.",{"paper_id":2139,"title":2140,"year":81,"month":855,"day":63,"doi":2141,"resource_url":2142,"first_page":63,"last_page":63,"pdf_url":2143,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2144,"paper_type":860,"authors":2145,"abstract":2159},"lrec2018-main-066","An Application for Building a Polish Telephone Speech Corpus","10.63317\u002F3gb6fwdcyd54","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-066","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F50.pdf","ziolko-etal-2018-application",[2146,2149,2151,2154,2157],{"paper_id":2139,"author_seq":247,"given_name":2147,"surname":2148,"affiliation":63,"orcid":63},"Bartosz","Ziółko",{"paper_id":2139,"author_seq":232,"given_name":996,"surname":2150,"affiliation":63,"orcid":63},"Żelasko",{"paper_id":2139,"author_seq":218,"given_name":2152,"surname":2153,"affiliation":63,"orcid":63},"Ireneusz","Gawlik",{"paper_id":2139,"author_seq":203,"given_name":2155,"surname":2156,"affiliation":63,"orcid":63},"Tomasz","Pędzimąż",{"paper_id":2139,"author_seq":188,"given_name":2155,"surname":2158,"affiliation":63,"orcid":63},"Jadczyk","The paper presents our approach towards building a tool for speech corpus collection of a specific domain content. 
We describe our iterative approach to the development of this tool, with focus on the most problematic issues at each working stage. Our latest version synchronizes VoIP call management and recording with a web application providing content. The tool was already used and applied for Polish to gather 63 hours of automatically annotated recordings across several domains. Amongst them, we obtained a continuous speech corpus designed with an emphasis on optimal phonetic diversification in relation to the phonetically balanced National Corpus of Polish. We evaluate the usefulness of this data against the GlobalPhone corpus in the task of training an acoustic model for a telephone speech ASR system and show that the model trained on our balanced corpus achieves significantly lower WER in two grammar-based speech recognition tasks - street names and public transport routes numbers.",{"paper_id":2161,"title":2162,"year":81,"month":855,"day":63,"doi":2163,"resource_url":2164,"first_page":63,"last_page":63,"pdf_url":2165,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2166,"paper_type":860,"authors":2167,"abstract":2174},"lrec2018-main-067","CPJD Corpus: Crowdsourced Parallel Speech Corpus of Japanese Dialects","10.63317\u002F23papnq84qja","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-067","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F67.pdf","takamichi-saruwatari-2018-cpjd",[2168,2171],{"paper_id":2161,"author_seq":247,"given_name":2169,"surname":2170,"affiliation":63,"orcid":63},"Shinnosuke","Takamichi",{"paper_id":2161,"author_seq":232,"given_name":2172,"surname":2173,"affiliation":63,"orcid":63},"Hiroshi","Saruwatari","Public parallel corpora of dialects can accelerate related studies such as spoken language processing. Various corpora have been collected using a well-equipped recording environment, such as voice recording in an anechoic room. 
However, due to geographical and expense issues, it is impossible to use such a perfect recording environment for collecting all existing dialects. To address this problem, we used web-based recording and crowdsourcing platforms to construct a crowdsourced parallel speech corpus of Japanese dialects (CPJD corpus) including parallel text and speech data of 21 Japanese dialects. We recruited native dialect speakers on the crowdsourcing platform, and the hired speakers recorded their dialect speech using their personal computer or smartphone in their homes. This paper shows the results of the data collection and analyzes the audio data in terms of the signal-to-noise ratio and mispronunciations.",{"paper_id":2176,"title":2177,"year":81,"month":855,"day":63,"doi":2178,"resource_url":2179,"first_page":63,"last_page":63,"pdf_url":2180,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2181,"paper_type":860,"authors":2182,"abstract":2189},"lrec2018-main-068","Korean L2 Vocabulary Prediction: Can a Large Annotated Corpus be Used to Train Better Models for Predicting Unknown Words?","10.63317\u002F3dhucyti8f2x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-068","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F272.pdf","yancey-lepage-2018-korean",[2183,2186],{"paper_id":2176,"author_seq":247,"given_name":2184,"surname":2185,"affiliation":63,"orcid":63},"Kevin","Yancey",{"paper_id":2176,"author_seq":232,"given_name":2187,"surname":2188,"affiliation":63,"orcid":63},"Yves","Lepage","Vocabulary knowledge prediction is an important task in lexical text simplification for foreign language learners (L2 learners).  However,  previously studied methods that use hand-crafted rules based on one or two word features have had limited success.  A recent study hypothesized that a supervised learning classifier trained on a large annotated corpus of words unknown by L2 learners may yield better results.  
Our study crowdsourced the production of such a corpus for Korean, now consisting of 2,385 annotated passages contributed by 357 distinct L2 learners.  Our preliminary evaluation of models trained on this corpus show favorable results, thus confirming the hypothesis.  In this paper, we describe our methodology for building this resource in detail and analyze its results so that it can be duplicated for other languages.  We also present our preliminary evaluation of models trained on this annotated corpus, the best of which recalls 80% of unknown words with 71% precision.        We make our annotation data available.",{"paper_id":2191,"title":2192,"year":81,"month":855,"day":63,"doi":2193,"resource_url":2194,"first_page":63,"last_page":63,"pdf_url":2195,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2196,"paper_type":860,"authors":2197,"abstract":2225},"lrec2018-main-069","Crowdsourcing-based Annotation of the Accounting Registers of the Italian Comedy","10.63317\u002F2dfkbvjdmnpb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-069","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F286.pdf","granet-etal-2018-crowdsourcing",[2198,2201,2204,2207,2210,2213,2216,2217,2220,2223],{"paper_id":2191,"author_seq":247,"given_name":2199,"surname":2200,"affiliation":63,"orcid":63},"Adeline","Granet",{"paper_id":2191,"author_seq":232,"given_name":2202,"surname":2203,"affiliation":63,"orcid":63},"Benjamin","Hervy",{"paper_id":2191,"author_seq":218,"given_name":2205,"surname":2206,"affiliation":63,"orcid":63},"Geoffrey","Roman-Jimenez",{"paper_id":2191,"author_seq":203,"given_name":2208,"surname":2209,"affiliation":63,"orcid":63},"Marouane","Hachicha",{"paper_id":2191,"author_seq":188,"given_name":2211,"surname":2212,"affiliation":63,"orcid":63},"Emmanuel","Morin",{"paper_id":2191,"author_seq":172,"given_name":2214,"surname":2215,"affiliation":63,"orcid":63},"Harold","Mouchère",{"paper_id":2191,"author_seq":155,"giv
en_name":1971,"surname":1972,"affiliation":63,"orcid":63},{"paper_id":2191,"author_seq":138,"given_name":2218,"surname":2219,"affiliation":63,"orcid":63},"Guillaume","Raschia",{"paper_id":2191,"author_seq":121,"given_name":2221,"surname":2222,"affiliation":63,"orcid":63},"Françoise","Rubellin",{"paper_id":2191,"author_seq":104,"given_name":904,"surname":2224,"affiliation":63,"orcid":63},"Viard-Gaudin","In this paper, we present a double annotation system for new handwritten historical documents. We have 25,250 pages of registers of the Italian Comedy of the 18th century containing a great variety and amount of information. A crowdsourcing platform has been set up in order to perform labeling and transcription of the documents.  The main purpose is to grasp budget data from the all 18th century and to create a dedicated database for the domain's experts. In order to improve, help and accelerate the process, a parallel system has been designed to automatically process information. We focus on the titles field, segmenting them into lines and checking candidate transcripts. 
We have collected a base of 971 title lines.",{"paper_id":2227,"title":2228,"year":81,"month":855,"day":63,"doi":2229,"resource_url":2230,"first_page":63,"last_page":63,"pdf_url":2231,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2232,"paper_type":860,"authors":2233,"abstract":2243},"lrec2018-main-070","FEIDEGGER: A Multi-modal Corpus of Fashion Images and Descriptions in German","10.63317\u002F48kpkwvk9g69","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-070","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F319.pdf","lefakis-etal-2018-feidegger",[2234,2237,2240],{"paper_id":2227,"author_seq":247,"given_name":2235,"surname":2236,"affiliation":63,"orcid":63},"Leonidas","Lefakis",{"paper_id":2227,"author_seq":232,"given_name":2238,"surname":2239,"affiliation":63,"orcid":63},"Alan","Akbik",{"paper_id":2227,"author_seq":218,"given_name":2241,"surname":2242,"affiliation":63,"orcid":63},"Roland","Vollgraf","The availability of multi-modal datasets that pair images and textual descriptions of their content has been a crucial driver in progress of various text-image tasks such as automatic captioning and text-to-image retrieval. In this paper, we present FEIDEGGER, a new multi-modal corpus that focuses specifically on the domain of fashion items and their visual descriptions in German. We argue that such narrow-domain multi-modality presents a unique set of challenges such as fine-grained image distinctions and domain-specific language, and release this dataset to the research community to enable study of these challenges. 
This paper illustrates our crowdsourcing strategy to acquire the textual descriptions, gives an overview over the \\dataset~dataset, and discusses possible use cases.",{"paper_id":2245,"title":2246,"year":81,"month":855,"day":63,"doi":2247,"resource_url":2248,"first_page":63,"last_page":63,"pdf_url":2249,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2250,"paper_type":860,"authors":2251,"abstract":2258},"lrec2018-main-071","Toward a Lightweight Solution for Less-resourced Languages: Creating a POS Tagger for Alsatian Using Voluntary Crowdsourcing","10.63317\u002F3oce9v554vus","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-071","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F326.pdf","millour-fort-2018-toward",[2252,2255],{"paper_id":2245,"author_seq":247,"given_name":2253,"surname":2254,"affiliation":63,"orcid":63},"Alice","Millour",{"paper_id":2245,"author_seq":232,"given_name":2256,"surname":2257,"affiliation":63,"orcid":63},"Karën","Fort","We present here the results of an experiment aiming at crowdsourcing part-of-speech annotations for a less-resourced French regional language, Alsatian. We used for this purpose a specifically-developed slightly gamified platform, Bisame. It allowed us to gather annotations on a variety of corpora covering some of the language dialectal variations. The quality of the annotations, which reach an averaged F-measure of 93%, enabled us to train a first tagger for Alsatian that is nearly 84% accurate. The platform as well as the produced annotations and tagger are freely available. 
The platform can easily be adapted to other languages, thus providing a solution to (some of) the less-resourced languages issue.",{"paper_id":2260,"title":2261,"year":81,"month":855,"day":63,"doi":2262,"resource_url":2263,"first_page":63,"last_page":63,"pdf_url":2264,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2265,"paper_type":860,"authors":2266,"abstract":2273},"lrec2018-main-072","Crowdsourced Corpus of Sentence Simplification with Core Vocabulary","10.63317\u002F2a5ax2twwiob","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-072","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F327.pdf","katsuta-yamamoto-2018-crowdsourced",[2267,2270],{"paper_id":2260,"author_seq":247,"given_name":2268,"surname":2269,"affiliation":63,"orcid":63},"Akihiro","Katsuta",{"paper_id":2260,"author_seq":232,"given_name":2271,"surname":2272,"affiliation":63,"orcid":63},"Kazuhide","Yamamoto","We present a new Japanese crowdsourced data set of simplified sentences created from more complex ones. Our simplicity standard involves all rewritable words in the simplified sentences being drawn from a core vocabulary of 2,000 words. Our simplified corpus is a collection of complex sentences from Japanese textbooks and reference books together with simplified sentences generated by humans, paired with data on how the complex sentences were paraphrased. The corpus contains a total of 15,000 sentences, in both complex and simple versions. In addition, we investigate the differences in the simplification operations used by each annotator. The aim is to understand whether a crowdsourced complex-simple parallel corpus is an appropriate data source for automated simplification by machine learning. The results, that there was a high level of agreement between the annotators building the data set. So, we believe that this corpus is a good quality data set for machine learning for simplification. 
We therefore plan to expand the scale of the simplified corpus in the future.",{"paper_id":2275,"title":2276,"year":81,"month":855,"day":63,"doi":2277,"resource_url":2278,"first_page":63,"last_page":63,"pdf_url":2279,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2280,"paper_type":860,"authors":2281,"abstract":2319},"lrec2018-main-073","A Multilingual Wikified Data Set of Educational Material","10.63317\u002F4fdi63ucw54h","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-073","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F515.pdf","hendrickx-etal-2018-multilingual",[2282,2285,2288,2291,2294,2297,2300,2303,2306,2309,2312,2315,2316],{"paper_id":2275,"author_seq":247,"given_name":2283,"surname":2284,"affiliation":63,"orcid":63},"Iris","Hendrickx",{"paper_id":2275,"author_seq":232,"given_name":2286,"surname":2287,"affiliation":63,"orcid":63},"Eirini","Takoulidou",{"paper_id":2275,"author_seq":218,"given_name":2289,"surname":2290,"affiliation":63,"orcid":63},"Thanasis","Naskos",{"paper_id":2275,"author_seq":203,"given_name":2292,"surname":2293,"affiliation":63,"orcid":63},"Katia Lida","Kermanidis",{"paper_id":2275,"author_seq":188,"given_name":2295,"surname":2296,"affiliation":63,"orcid":63},"Vilelmini","Sosoni",{"paper_id":2275,"author_seq":172,"given_name":2298,"surname":2299,"affiliation":63,"orcid":63},"Hugo","de Vos",{"paper_id":2275,"author_seq":155,"given_name":2301,"surname":2302,"affiliation":63,"orcid":63},"Maria","Stasimioti",{"paper_id":2275,"author_seq":138,"given_name":2304,"surname":2305,"affiliation":63,"orcid":63},"Menno","van 
Zaanen",{"paper_id":2275,"author_seq":121,"given_name":2307,"surname":2308,"affiliation":63,"orcid":63},"Panayota","Georgakopoulou",{"paper_id":2275,"author_seq":104,"given_name":2310,"surname":2311,"affiliation":63,"orcid":63},"Valia","Kordoni",{"paper_id":2275,"author_seq":87,"given_name":2313,"surname":2314,"affiliation":63,"orcid":63},"Maja","Popovic",{"paper_id":2275,"author_seq":73,"given_name":1771,"surname":1814,"affiliation":63,"orcid":63},{"paper_id":2275,"author_seq":55,"given_name":2317,"surname":2318,"affiliation":63,"orcid":63},"Antal","van den Bosch","We present a parallel wikified data set of parallel texts in eleven language pairs from the educational domain. English sentences are lined up to sentences in eleven other languages (BG, CS, DE, EL, HR, IT, NL, PL, PT, RU, ZH) where names and noun phrases (entities) are manually annotated and linked to their respective Wikipedia pages. For every linked entity in English, the corresponding term or phrase in the target language is also marked and linked to its Wikipedia page in that language. The annotation process was performed via crowdsourcing. In this paper we present the task, annotation process, the encountered difficulties with crowdsourcing for complex annotation, and the data set in more detail. We demonstrate the usage of the data set for Wikification evaluation. 
This data set is valuable as it constitutes a rich resource consisting of annotated data of English text linked to translations in eleven languages including several languages such as Bulgarian and Greek for which not many LT resources are available.",{"paper_id":2321,"title":2322,"year":81,"month":855,"day":63,"doi":2323,"resource_url":2324,"first_page":63,"last_page":63,"pdf_url":2325,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2326,"paper_type":860,"authors":2327,"abstract":2337},"lrec2018-main-074","Using Crowd Agreement for Wordnet Localization","10.63317\u002F4axeudj8s5ni","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-074","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F582.pdf","ganbold-etal-2018-using",[2328,2331,2334],{"paper_id":2321,"author_seq":247,"given_name":2329,"surname":2330,"affiliation":63,"orcid":63},"Amarsanaa","Ganbold",{"paper_id":2321,"author_seq":232,"given_name":2332,"surname":2333,"affiliation":63,"orcid":63},"Altangerel","Chagnaa",{"paper_id":2321,"author_seq":218,"given_name":2335,"surname":2336,"affiliation":63,"orcid":63},"Gábor","Bella","Building a wordnet from scratch is a huge task, especially for languages less equipped with pre-existing lexical resources such as thesauri or bilingual dictionaries. We address the issue of costliness of human supervision through crowdsourcing that offers a good trade-off between quality of output and speed of progress. In this paper, we demonstrate a two-phase crowdsourcing workflow that consists of a synset localization step followed by a validation step. Validation is performed using the inter-rater agreement metrics Fleiss’ kappa and Krippendorf’s alpha, which allow us to estimate the precision of the result, as well as to set a balance between precision and recall. 
In our experiment, 947 synsets were localized from English to Mongolian and evaluated through crowdsourcing with the precision of 0.74.",{"paper_id":2339,"title":2340,"year":81,"month":855,"day":63,"doi":2341,"resource_url":2342,"first_page":63,"last_page":63,"pdf_url":2343,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2344,"paper_type":860,"authors":2345,"abstract":2358},"lrec2018-main-075","Translation Crowdsourcing: Creating a Multilingual Corpus of Online Educational Content","10.63317\u002F2fsnu3dm2m7w","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-075","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F677.pdf","sosoni-etal-2018-translation",[2346,2347,2348,2349,2350,2351,2352,2355,2356,2357],{"paper_id":2339,"author_seq":247,"given_name":2295,"surname":2296,"affiliation":63,"orcid":63},{"paper_id":2339,"author_seq":232,"given_name":2292,"surname":2293,"affiliation":63,"orcid":63},{"paper_id":2339,"author_seq":218,"given_name":2301,"surname":2302,"affiliation":63,"orcid":63},{"paper_id":2339,"author_seq":203,"given_name":2289,"surname":2290,"affiliation":63,"orcid":63},{"paper_id":2339,"author_seq":188,"given_name":2286,"surname":2287,"affiliation":63,"orcid":63},{"paper_id":2339,"author_seq":172,"given_name":2304,"surname":2305,"affiliation":63,"orcid":63},{"paper_id":2339,"author_seq":155,"given_name":2353,"surname":2354,"affiliation":63,"orcid":63},"Sheila","Castilho",{"paper_id":2339,"author_seq":138,"given_name":2307,"surname":2308,"affiliation":63,"orcid":63},{"paper_id":2339,"author_seq":121,"given_name":2310,"surname":2311,"affiliation":63,"orcid":63},{"paper_id":2339,"author_seq":104,"given_name":1771,"surname":1814,"affiliation":63,"orcid":63},"The present work describes a multilingual corpus of online content in the educational domain, i.e. 
Massive Open Online Course material, ranging from course forum text to subtitles of online video lectures, that has been developed via large-scale crowdsourcing. The English source text is manually translated into 11 European and BRIC languages using the CrowdFlower platform. During the process several challenges arose which mainly involved the in-domain text genre, the large text volume, the idiosyncrasies of each target language, the limitations of the crowdsourcing platform, as well as the quality assurance and workflow issues of the crowdsourcing process. The corpus constitutes a product of the EU-funded TraMOOC project and is utilised in the project in order to train, tune and test machine translation engines.",{"paper_id":2360,"title":2361,"year":81,"month":855,"day":63,"doi":2362,"resource_url":2363,"first_page":63,"last_page":63,"pdf_url":2364,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2365,"paper_type":860,"authors":2366,"abstract":2370},"lrec2018-main-076","Building an English Vocabulary Knowledge Dataset of Japanese English-as-a-Second-Language Learners Using Crowdsourcing","10.63317\u002F2fc9k5vsp9yk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-076","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F978.pdf","ehara-2018-building",[2367],{"paper_id":2360,"author_seq":247,"given_name":2368,"surname":2369,"affiliation":63,"orcid":63},"Yo","Ehara","We introduce a freely available dataset for analyzing the English vocabulary of English-as-a-second language (ESL) learners. While ESL vocabulary tests have been extensively studied, few of the results have been made public. 
This is probably because 1) most of the tests are used to grade test takers, i.e., placement tests; thus, they are treated as private information that should not be leaked, and 2) the primary focus of most language-educators is how to measure their students' ESL vocabulary, rather than the test results of the other test takers. However, to build and evaluate systems to support language learners, we need a dataset that records the learners' vocabulary. Our dataset meets this need. It contains the results of the vocabulary size test, a well-studied English vocabulary test, by one hundred test takers hired via crowdsourcing. Unlike high-stakes testing, the test takers of our dataset were not motivated to cheat on the tests to obtain high scores. This setting is similar to that of typical language-learning support systems. Brief test-theory analysis on the dataset showed an excellent test reliability of $0.91$ (Chronbach's alpha). Analysis using item response theory also indicates that the test is reliable and successfully measures the vocabulary ability of language learners. 
We also measured how well the responses from the learners can be predicted with high accuracy using machine-learning methods.",{"paper_id":2372,"title":2373,"year":81,"month":855,"day":63,"doi":2374,"resource_url":2375,"first_page":63,"last_page":63,"pdf_url":2376,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2377,"paper_type":860,"authors":2378,"abstract":2385},"lrec2018-main-077","Chinese Relation Classification using Long Short Term Memory Networks","10.63317\u002F26mxtzdgbbbo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-077","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F153.pdf","zhang-moldovan-2018-chinese",[2379,2382],{"paper_id":2372,"author_seq":247,"given_name":2380,"surname":2381,"affiliation":63,"orcid":63},"Linrui","Zhang",{"paper_id":2372,"author_seq":232,"given_name":2383,"surname":2384,"affiliation":63,"orcid":63},"Dan","Moldovan","Relation classification is the task to predict semantic relations between pairs of entities in a given text. In this paper, a novel Long Short Term Memory Network (LSTM)-based approach is proposed to extract relations between entities in Chinese text. The shortest dependency path (SDP) between two entities, together with the various selected features in the path, are ﬁrst extracted, and then used as input of an LSTM model to predict the relation between them. 
The performance of the system was evaluated on the ACE 2005 Multilingual Training Corpus, and achieved a state-of-the-art F-measure of 87.87% on six general type relations and 83.40% on eighteen subtype relations in this corpus.",{"paper_id":2387,"title":2388,"year":81,"month":855,"day":63,"doi":2389,"resource_url":2390,"first_page":63,"last_page":63,"pdf_url":2391,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2392,"paper_type":860,"authors":2393,"abstract":2414},"lrec2018-main-078","The UIR Uncertainty Corpus for Chinese: Annotating Chinese Microblog Corpus for Uncertainty Identification from Social Media","10.63317\u002F2t8j6nc3r8wo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-078","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F208.pdf","li-etal-2018-uir",[2394,2396,2399,2402,2404,2407,2409,2411],{"paper_id":2387,"author_seq":247,"given_name":2395,"surname":1591,"affiliation":63,"orcid":63},"Binyang",{"paper_id":2387,"author_seq":232,"given_name":2397,"surname":2398,"affiliation":63,"orcid":63},"Jun","Xiang",{"paper_id":2387,"author_seq":218,"given_name":2400,"surname":2401,"affiliation":63,"orcid":63},"Le","Chen",{"paper_id":2387,"author_seq":203,"given_name":1585,"surname":2403,"affiliation":63,"orcid":63},"Han",{"paper_id":2387,"author_seq":188,"given_name":2405,"surname":2406,"affiliation":63,"orcid":63},"Xiaoyan","Yu",{"paper_id":2387,"author_seq":172,"given_name":2408,"surname":1585,"affiliation":63,"orcid":63},"Ruifeng",{"paper_id":2387,"author_seq":155,"given_name":2410,"surname":1588,"affiliation":63,"orcid":63},"Tengjiao",{"paper_id":2387,"author_seq":138,"given_name":2412,"surname":2413,"affiliation":63,"orcid":63},"Kam-fai","Wong","Uncertainty identification is an important semantic processing task, which is critical to the quality of information in terms of factuality in many NLP techniques and applications, such as question answering, information extraction, and so on. 
Especially in social media, the factuality becomes a primary concern, because the social media texts are usually written wildly. The lack of open uncertainty corpus for Chinese social media contexts brings limitations for many social media oriented applications. In this work, we present the first open uncertainty corpus of microblogs in Chinese, namely, the UIR Uncertainty Corpus (UUC). At current stage, we annotated 40,168 Chinese microblogs from Sina Microblog. The schema of CoNLL 2010 has been adapted, where the corpus contains annotations at each microblog level for uncertainty and 6 sub-classes with 11,071 microblogs under uncertainty. To adapt to the characteristics of social media, we identify the uncertainty based on the contextual uncertain semantics rather than the traditional cue-phrases, and the sub-class could provide more information for research on handling uncertainty in social media texts. The Kappa value indicated that our annotation results were substantially reliable.",{"paper_id":2416,"title":2417,"year":81,"month":855,"day":63,"doi":2418,"resource_url":2419,"first_page":63,"last_page":63,"pdf_url":2420,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2421,"paper_type":860,"authors":2422,"abstract":2439},"lrec2018-main-079","EventWiki: A Knowledge Base of Major 
Events","10.63317\u002F3oa4g49eb3es","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-079","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F213.pdf","ge-etal-2018-eventwiki",[2423,2426,2428,2430,2433,2436],{"paper_id":2416,"author_seq":247,"given_name":2424,"surname":2425,"affiliation":63,"orcid":63},"Tao","Ge",{"paper_id":2416,"author_seq":232,"given_name":1221,"surname":2427,"affiliation":63,"orcid":63},"Cui",{"paper_id":2416,"author_seq":218,"given_name":2429,"surname":2069,"affiliation":63,"orcid":63},"Baobao",{"paper_id":2416,"author_seq":203,"given_name":2431,"surname":2432,"affiliation":63,"orcid":63},"Zhifang","Sui",{"paper_id":2416,"author_seq":188,"given_name":2434,"surname":2435,"affiliation":63,"orcid":63},"Furu","Wei",{"paper_id":2416,"author_seq":172,"given_name":2437,"surname":2438,"affiliation":63,"orcid":63},"Ming","Zhou","This paper introduces a new resource called EventWiki which is, to the best of our knowledge, the first knowledge base resource of major events. In contrast to most existing knowledge bases that focus on static entities such as people, locations and organizations, our EventWiki concentrate on major events, in which all entries in EventWiki are important events in mankind history. 
We demonstrate that EventWiki is a very useful resource for information extraction regarding events in Natural Language Processing (NLP), knowledge inference and automatic knowledge base construction.",{"paper_id":2441,"title":2442,"year":81,"month":855,"day":63,"doi":2443,"resource_url":2444,"first_page":63,"last_page":63,"pdf_url":2445,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2446,"paper_type":860,"authors":2447,"abstract":2453},"lrec2018-main-080","Annotating Spin in Biomedical Scientific Publications : the case of Random Controlled Trials (RCTs)","10.63317\u002F4ydwv7555nce","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-080","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F278.pdf","koroleva-paroubek-2018-annotating",[2448,2450],{"paper_id":2441,"author_seq":247,"given_name":884,"surname":2449,"affiliation":63,"orcid":63},"Koroleva",{"paper_id":2441,"author_seq":232,"given_name":2451,"surname":2452,"affiliation":63,"orcid":63},"Patrick","Paroubek","In this paper we report on the collection in the context of the MIROR project of a corpus of biomedical articles for the task of automatic detection of inadequate claims (spin), which to our knowledge has never been addressed before.  
We present the manual annotation model and its annotation guidelines and describe the planned machine learning experiments and evaluations.",{"paper_id":2455,"title":2456,"year":81,"month":855,"day":63,"doi":2457,"resource_url":2458,"first_page":63,"last_page":63,"pdf_url":2459,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2460,"paper_type":860,"authors":2461,"abstract":2476},"lrec2018-main-081","Visualization of the occurrence trend of infectious diseases using Twitter","10.63317\u002F2yehbaqj6yoq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-081","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F298.pdf","matsumoto-etal-2018-visualization",[2462,2465,2468,2470,2473],{"paper_id":2455,"author_seq":247,"given_name":2463,"surname":2464,"affiliation":63,"orcid":63},"Ryusei","Matsumoto",{"paper_id":2455,"author_seq":232,"given_name":2466,"surname":2467,"affiliation":63,"orcid":63},"Minoru","Yoshida",{"paper_id":2455,"author_seq":218,"given_name":2469,"surname":2464,"affiliation":63,"orcid":63},"Kazuyuki",{"paper_id":2455,"author_seq":203,"given_name":2471,"surname":2472,"affiliation":63,"orcid":63},"Hironobu","Matsuda",{"paper_id":2455,"author_seq":188,"given_name":2474,"surname":2475,"affiliation":63,"orcid":63},"Kenji","Kita","We propose a system for visualizing the epidemics of infectious diseases. We apply factuality analysis both to disease detection and location estimation for accurate visualization. 
We tested our methods for several infectious diseases, and show that our method performs well on various diseases.",{"paper_id":2478,"title":2479,"year":81,"month":855,"day":63,"doi":2480,"resource_url":2481,"first_page":63,"last_page":63,"pdf_url":2482,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2483,"paper_type":860,"authors":2484,"abstract":2491},"lrec2018-main-082","Reusable workflows for gender prediction","10.63317\u002F3o52wzf3cjxx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-082","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F310.pdf","martinc-pollak-2018-reusable",[2485,2488],{"paper_id":2478,"author_seq":247,"given_name":2486,"surname":2487,"affiliation":63,"orcid":63},"Matej","Martinc",{"paper_id":2478,"author_seq":232,"given_name":2489,"surname":2490,"affiliation":63,"orcid":63},"Senja","Pollak","This paper presents a system for author profiling (AP) modeling that reduces the complexity and time of building a sophisticated model for a number of different AP tasks. The system is implemented in a cloud-based visual programming platform ClowdFlows and is publicly available to a wider audience. In the platform, we also implemented our already existing state of the art gender prediction model and tested it on a number of cross-genre tasks. The results show that the implemented model, which was trained on tweets, achieves results comparable to state of the art models for cross-genre gender prediction. 
There is however a noticeable decrease in accuracy when the genre of a test set is different from the genre of the train set.",{"paper_id":2493,"title":2494,"year":81,"month":855,"day":63,"doi":2495,"resource_url":2496,"first_page":63,"last_page":63,"pdf_url":2497,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2498,"paper_type":860,"authors":2499,"abstract":2506},"lrec2018-main-083","Knowing the Author by the Company His Words Keep","10.63317\u002F5imfijvdyr5x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-083","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F349.pdf","hoenen-schenk-2018-knowing",[2500,2503],{"paper_id":2493,"author_seq":247,"given_name":2501,"surname":2502,"affiliation":63,"orcid":63},"Armin","Hoenen",{"paper_id":2493,"author_seq":232,"given_name":2504,"surname":2505,"affiliation":63,"orcid":63},"Niko","Schenk","In this paper, we analyze relationships between word pairs and evaluate their idiosyncratic properties in the applied context of authorship attribution. Specifically, on three literary corpora we optimize word pair features for information gain which reflect word similarity as measured by word embeddings. We analyze the quality of the most informative features in terms of word type relation (a comparison of different constellations of function and content words), similarity, and relatedness. Results point to the extraordinary role of function words within the authorship attribution task being extended to their pairwise relational patterns. Similarity of content words is likewise among the most informative features. 
From a cognitive perspective, we conclude that both relationship types reflect short distance connections in the human brain, which is highly indicative of an individual writing style.",{"paper_id":2508,"title":2509,"year":81,"month":855,"day":63,"doi":2510,"resource_url":2511,"first_page":63,"last_page":63,"pdf_url":2512,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2513,"paper_type":860,"authors":2514,"abstract":2520},"lrec2018-main-084","Towards a Gold Standard Corpus for Variable Detection and Linking in Social Science Publications","10.63317\u002F46iujrge2mow","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-084","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F368.pdf","zielinski-mutschke-2018-towards",[2515,2518],{"paper_id":2508,"author_seq":247,"given_name":2516,"surname":2517,"affiliation":63,"orcid":63},"Andrea","Zielinski",{"paper_id":2508,"author_seq":232,"given_name":1474,"surname":2519,"affiliation":63,"orcid":63},"Mutschke","In this paper, we describe our effort to create a new corpus for the evaluation of detecting and linking so-called survey variables in social science publications  (e.g. \"Do you believe in Heaven?\"). The task is to recognize survey variable mentions in a given text, disambiguate them, and link them to the corresponding variable within a knowledge base. Since there are generally hundreds of candidates to link to and due to the wide variety of forms they can take, this is a challenging task within NLP. The contribution of our work is the first gold standard corpus for the variable detection task.  We describe the annotation guidelines and the annotation process. The produced corpus is multilingual - German and English - and includes manually curated word and phrase alignments.  Moreover, it includes text samples that could not be assigned to any variables, denoted as negative examples. 
Based on the new dataset, we conduct an evaluation of several state-of-the-art text classification and textual similarity methods. The annotated corpus is made available along with an open-source baseline system for variable mention identification and linking.",{"paper_id":2522,"title":2523,"year":81,"month":855,"day":63,"doi":2524,"resource_url":2525,"first_page":63,"last_page":63,"pdf_url":2526,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2527,"paper_type":860,"authors":2528,"abstract":2544},"lrec2018-main-085","KRAUTS: A German Temporally Annotated News Corpus","10.63317\u002F4ap8wacz3r76","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-085","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F436.pdf","strotgen-etal-2018-krauts",[2529,2532,2535,2538,2541],{"paper_id":2522,"author_seq":247,"given_name":2530,"surname":2531,"affiliation":63,"orcid":63},"Jannik","Strötgen",{"paper_id":2522,"author_seq":232,"given_name":2533,"surname":2534,"affiliation":63,"orcid":63},"Anne-Lyse","Minard",{"paper_id":2522,"author_seq":218,"given_name":2536,"surname":2537,"affiliation":63,"orcid":63},"Lukas","Lange",{"paper_id":2522,"author_seq":203,"given_name":2539,"surname":2540,"affiliation":63,"orcid":63},"Manuela","Speranza",{"paper_id":2522,"author_seq":188,"given_name":2542,"surname":2543,"affiliation":63,"orcid":63},"Bernardo","Magnini","In recent years, temporal tagging, i.e., the extraction and normalization of temporal expressions, has become a vivid research area. Several tools have been made available, and new strategies have been developed. Due to domain-specific challenges, evaluations of new methods should be performed on diverse text types. Despite significant efforts towards multilinguality in the context of temporal tagging, for all languages except English, annotated corpora exist only for a single domain. 
In the case of German, for example, only a narrative-style corpus has been manually annotated so far, thus no evaluations of German temporal tagging performance on news articles can be made. In this paper, we present KRAUTS, a new German temporally annotated corpus containing two subsets of news documents: articles from the daily newspaper Dolomiten and from the weekly newspaper Die Zeit. Overall, the corpus contains 192 documents with 1,140 annotated temporal expressions, and has been made publicly available to further boost research in temporal tagging.",{"paper_id":2546,"title":2547,"year":81,"month":855,"day":63,"doi":2548,"resource_url":2549,"first_page":63,"last_page":63,"pdf_url":2550,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2551,"paper_type":860,"authors":2552,"abstract":2623},"lrec2018-main-086","CogCompNLP: Your Swiss Army Knife for NLP","10.63317\u002F3yg27frq39yx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-086","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F157.pdf","khashabi-etal-2018-cogcompnlp",[2553,2556,2558,2560,2563,2564,2567,2570,2573,2576,2579,2582,2585,2588,2591,2593,2596,2600,2604,2608,2612,2616,2620],{"paper_id":2546,"author_seq":247,"given_name":2554,"surname":2555,"affiliation":63,"orcid":63},"Daniel","Khashabi",{"paper_id":2546,"author_seq":232,"given_name":1364,"surname":2557,"affiliation":63,"orcid":63},"Sammons",{"paper_id":2546,"author_seq":218,"given_name":2559,"surname":2438,"affiliation":63,"orcid":63},"Ben",{"paper_id":2546,"author_seq":203,"given_name":2561,"surname":2562,"affiliation":63,"orcid":63},"Tom","Redman",{"paper_id":2546,"author_seq":188,"given_name":1523,"surname":1524,"affiliation":63,"orcid":63},{"paper_id":2546,"author_seq":172,"given_name":2565,"surname":2566,"affiliation":63,"orcid":63},"Vivek","Srikumar",{"paper_id":2546,"author_seq":155,"given_name":2568,"surname":2569,"affiliation":63,"orcid":63},"Nicholas","Rizzolo",{"paper_id":25
46,"author_seq":138,"given_name":2571,"surname":2572,"affiliation":63,"orcid":63},"Lev","Ratinov",{"paper_id":2546,"author_seq":121,"given_name":2574,"surname":2575,"affiliation":63,"orcid":63},"Guanheng","Luo",{"paper_id":2546,"author_seq":104,"given_name":2577,"surname":2578,"affiliation":63,"orcid":63},"Quang","Do",{"paper_id":2546,"author_seq":87,"given_name":2580,"surname":2581,"affiliation":63,"orcid":63},"Chen-Tse","Tsai",{"paper_id":2546,"author_seq":73,"given_name":2583,"surname":2584,"affiliation":63,"orcid":63},"Subhro","Roy",{"paper_id":2546,"author_seq":55,"given_name":2586,"surname":2587,"affiliation":63,"orcid":63},"Stephen","Mayhew",{"paper_id":2546,"author_seq":38,"given_name":2589,"surname":2590,"affiliation":63,"orcid":63},"Zhili","Feng",{"paper_id":2546,"author_seq":17,"given_name":1734,"surname":2592,"affiliation":63,"orcid":63},"Wieting",{"paper_id":2546,"author_seq":2594,"given_name":2595,"surname":2406,"affiliation":63,"orcid":63},"16","Xiaodong",{"paper_id":2546,"author_seq":2597,"given_name":2598,"surname":2599,"affiliation":63,"orcid":63},"17","Yangqiu","Song",{"paper_id":2546,"author_seq":2601,"given_name":2602,"surname":2603,"affiliation":63,"orcid":63},"18","Shashank","Gupta",{"paper_id":2546,"author_seq":2605,"given_name":2606,"surname":2607,"affiliation":63,"orcid":63},"19","Shyam","Upadhyay",{"paper_id":2546,"author_seq":2609,"given_name":2610,"surname":2611,"affiliation":63,"orcid":63},"20","Naveen","Arivazhagan",{"paper_id":2546,"author_seq":2613,"given_name":2614,"surname":2615,"affiliation":63,"orcid":63},"21","Qiang","Ning",{"paper_id":2546,"author_seq":2617,"given_name":2618,"surname":2619,"affiliation":63,"orcid":63},"22","Shaoshi","Ling",{"paper_id":2546,"author_seq":2621,"given_name":2383,"surname":2622,"affiliation":63,"orcid":63},"23","Roth","Implementing a Natural Language Processing (NLP) system requires considerable engineering effort:  creating data-structures to represent  language  constructs;  reading  corpora  
annotations  into  these  data-structures;  applying  off-the-shelf  NLP  tools  to  augment  the text representation; extracting features and training machine learning components; conducting experiments and computing performance statistics; and creating the end-user application that integrates the implemented components.  While there are several widely used NLP libraries, each provides only partial coverage of these various tasks. We present our library COGCOMPNLP  which simplifies the process of design and development of NLP applications by providing modules to address different challenges: we provide a corpus-reader module that supports popular corpora in the NLP community, a module for various low-level data-structures and operations (such as search over  text),  a  module  for  feature  extraction,  and  an  extensive  suite of  annotation modules for a wide  range  of  semantic  and syntactic tasks.  These annotation modules are all integrated in a single system, PIPELINE, which allows users to easily use the annotators with simple direct calls using any JVM-based language, or over a network.  The sister project COGCOMPNLPYenables users to access the annotators with a Python interface.   We give a detailed account of our system’s structure and usage,  and where possible, compare it with other established NLP frameworks.  We report on the performance, including time and memory statistics, of each component on a selection of well-established datasets.  
Our system is publicly available for research use and external contributions, at:http:\u002F\u002Fgithub.com\u002FCogComp\u002Fcogcomp-nlp",{"paper_id":2625,"title":2626,"year":81,"month":855,"day":63,"doi":2627,"resource_url":2628,"first_page":63,"last_page":63,"pdf_url":2629,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2630,"paper_type":860,"authors":2631,"abstract":2637},"lrec2018-main-087","A Framework for the Needs of Different Types of Users in Multilingual Semantic Enrichment","10.63317\u002F3qv9yfu4ffaj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-087","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F262.pdf","nehring-sasaki-2018-framework",[2632,2635],{"paper_id":2625,"author_seq":247,"given_name":2633,"surname":2634,"affiliation":63,"orcid":63},"Jan","Nehring",{"paper_id":2625,"author_seq":232,"given_name":1211,"surname":2636,"affiliation":63,"orcid":63},"Sasaki","The FREME framework bridges Language Technologies (LT) and Linked Data (LD). It establishes workflows between LT and LD in a well defined, coherent way. FREME addresses common challenges that both researchers and industry face when integrating LT and LD: interoperability, \"silo\" solutions and the lack of adequate tooling.  Usability, reusability and interoperability are often attributes of frameworks and toolkits for LT and LD. In this paper, we take a novel approach: We define user types and user levels and describe how they influence design decisions in a LT and LD processing framework. In this way, we combine research outcomes from various communities: language technology, linked data and software interface engineering. This paper explains the different user types and how FREME addresses the specific needs of each user type. 
Core attributes of FREME are usability, reusability and interoperability",{"paper_id":2639,"title":2640,"year":81,"month":855,"day":63,"doi":2641,"resource_url":2642,"first_page":63,"last_page":63,"pdf_url":2643,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2644,"paper_type":860,"authors":2645,"abstract":2658},"lrec2018-main-088","The LREC Workshops Map","10.63317\u002F43jpjuqzpngb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-088","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F639.pdf","bartolini-etal-2018-lrec",[2646,2649,2652,2655],{"paper_id":2639,"author_seq":247,"given_name":2647,"surname":2648,"affiliation":63,"orcid":63},"Roberto","Bartolini",{"paper_id":2639,"author_seq":232,"given_name":2650,"surname":2651,"affiliation":63,"orcid":63},"Sara","Goggi",{"paper_id":2639,"author_seq":218,"given_name":2653,"surname":2654,"affiliation":63,"orcid":63},"Monica","Monachini",{"paper_id":2639,"author_seq":203,"given_name":2656,"surname":2657,"affiliation":63,"orcid":63},"Gabriella","Pardelli","The aim of this work is to present an overview of the research presented at the LREC workshops over the years 1998-2016 with the aim to shed light on the community represented by workshop participants in terms of country of origin, type of affiliation, gender. There has been also an effort towards the identification of the major topics dealt with as well as of the terminological variations noticed in this time span. 
Data has been retrieved from the portal of the European Language Resources Association (ELRA) which organizes the conference and the resulting corpus made up of workshops titles and of the related presentations has then been processed using a term extraction tool developed at ILC-CNR.",{"paper_id":2660,"title":2661,"year":81,"month":855,"day":63,"doi":2662,"resource_url":2663,"first_page":63,"last_page":63,"pdf_url":2664,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2665,"paper_type":860,"authors":2666,"abstract":2674},"lrec2018-main-089","Preserving Workflow Reproducibility: The RePlay-DH Client as a Tool for Process Documentation","10.63317\u002F5p8r86btyokb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-089","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F707.pdf","gartner-etal-2018-preserving",[2667,2669,2671],{"paper_id":2660,"author_seq":247,"given_name":1771,"surname":2668,"affiliation":63,"orcid":63},"Gärtner",{"paper_id":2660,"author_seq":232,"given_name":2670,"surname":1452,"affiliation":63,"orcid":63},"Uli",{"paper_id":2660,"author_seq":218,"given_name":2672,"surname":2673,"affiliation":63,"orcid":63},"Sibylle","Hermann","In this paper we present a software tool for elicitation and management of process metadata. It follows our previously published design idea of an assistant for researchers that aims at minimizing the additional effort required for producing a sustainable workflow documentation. With the ever-growing number of linguistic resources available, it also becomes increasingly important to provide proper documentation to make them comparable and to allow meaningful evaluations for specific use cases. The often prevailing practice of post hoc documentation of resource generation or research processes bears the risk of information loss. 
Not only does detailed documentation of a process aid in achieving reproducibility, it also increases usefulness of the documented work for others as a cornerstone of good scientific practice. Time pressure together with the lack of simple documentation methods leads to workflow documentation in practice being an arduous and often neglected task. Our tool ensures a clean documentation for common workflows in natural language processing and digital humanities. Additionally, it can easily be integrated into existing institutional infrastructures.",{"paper_id":2676,"title":2677,"year":81,"month":855,"day":63,"doi":2678,"resource_url":2679,"first_page":63,"last_page":63,"pdf_url":2680,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2681,"paper_type":860,"authors":2682,"abstract":2686},"lrec2018-main-090","The ACoLi CoNLL Libraries: Beyond Tab-Separated Values","10.63317\u002F39odsydd9zyd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-090","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F869.pdf","chiarcos-schenk-2018-acoli",[2683,2685],{"paper_id":2676,"author_seq":247,"given_name":904,"surname":2684,"affiliation":63,"orcid":63},"Chiarcos",{"paper_id":2676,"author_seq":232,"given_name":2504,"surname":2505,"affiliation":63,"orcid":63},"We introduce the ACoLi CoNLL libraries, a set of Java archives to facilitate advanced manipulations of corpora annotated in TSV formats, including all members of the CoNLL format family. In particular, we provide means for (i) rule-based re-write operations, (ii) visualization and manual annotation, (iii) merging CoNLL files, and (iv) data base support. The ACoLi CoNLL libraries provide command-line interface to these functionalities. 
The following aspects are technologically innovative and exceed beyond the state of the art:  We support \\emph{every} OWPL (one word per line) corpus format with tab-separated columns, whereas most existing tools are specific to one particular CoNLL dialect. We employ established W3C standards for rule-based graph rewriting operations on CoNLL sentences. We provide means for the heuristic, but fully automated merging of CoNLL annotations of the same textual content, in particular for resolving  conflicting tokenizations. We demonstrate the usefulness and practicability of our proposed CoNLL libraries on well-established data sets of the Universal Dependency corpus and the Penn Treebank.",{"paper_id":2688,"title":2689,"year":81,"month":855,"day":63,"doi":2690,"resource_url":2691,"first_page":63,"last_page":63,"pdf_url":2692,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2693,"paper_type":860,"authors":2694,"abstract":2704},"lrec2018-main-091","What’s Wrong, Python? – A Visual Differ and Graph Library for NLP in Python","10.63317\u002F3htbvyr8ckwm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-091","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F886.pdf","indig-etal-2018-whats",[2695,2698,2701],{"paper_id":2688,"author_seq":247,"given_name":2696,"surname":2697,"affiliation":63,"orcid":63},"Balázs","Indig",{"paper_id":2688,"author_seq":232,"given_name":2699,"surname":2700,"affiliation":63,"orcid":63},"András","Simonyi",{"paper_id":2688,"author_seq":218,"given_name":2702,"surname":2703,"affiliation":63,"orcid":63},"Noémi","Ligeti-Nagy","The correct analysis of the output of a program based on supervised learning is inevitable in order to be able to identify the errors it produced and characterise its error types. This task is fairly difficult without a proper tool, especially if one works with complex data structures such as parse trees or sentence alignments. 
In this paper, we present a library that allows the user to interactively visualise and compare the output of any program that yields a well-known data format. Our goal is to create a tool granting the total control of the visualisation to the user, including extensions, but also have the common primitives and data-formats implemented for typical cases. We describe the common features of the common NLP tasks from the viewpoint of visualisation in order to specify the essential primitive functions. We enumerate many popular off-the-shelf NLP visualisation programs to compare with our implementation, which unifies all of the profitable features of the existing programs adding extendibility as a crucial feature to them.",{"paper_id":2706,"title":2707,"year":81,"month":855,"day":63,"doi":2708,"resource_url":2709,"first_page":63,"last_page":63,"pdf_url":2710,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2711,"paper_type":860,"authors":2712,"abstract":2723},"lrec2018-main-092","ScholarGraph:a Chinese Knowledge Graph of Chinese Scholars","10.63317\u002F4876njwco5nu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-092","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F144.pdf","wang-etal-2018-scholargraph",[2713,2715,2718,2721],{"paper_id":2706,"author_seq":247,"given_name":2714,"surname":1588,"affiliation":63,"orcid":63},"Shuo",{"paper_id":2706,"author_seq":232,"given_name":2716,"surname":2717,"affiliation":63,"orcid":63},"Zehui","Hao",{"paper_id":2706,"author_seq":218,"given_name":2719,"surname":2720,"affiliation":63,"orcid":63},"Xiaofeng","Meng",{"paper_id":2706,"author_seq":203,"given_name":2722,"surname":1588,"affiliation":63,"orcid":63},"Qiuyue","Scholars and their academic information are widely distributed on the Web. Integrating these information and making association between them can play a catalytic role in academic evaluation and research. 
Since 2008, Web and Mobile Data Management Laboratory (WAMDM) in Renmin University of China began to collect Chinese literatures in more than 20 academic domains, and build a data integration system called ScholarSpace to automatically integrate the relevant Chinese academic information from the Chinese scholars and science. Focusing on the Chinese scholars, ScholarSpace can give you an academic portrait about a Chinese scholar with the form of knowledge graph. So the ScholarSpace can be transformed into a knowledge graph called ScholarGraph. It includes the scholar information such as the affiliation, publications, teacher-student relationship, etc. ScholarGraph is a subset of the whole knowledge graph generated from the ScholarSpace and is published on the web page of WAMDM. ScholarGraph consists of more than 10,000,000 triples, including more than 9,000,000 entities and 6 relations. It can support the search and query about portrait of Chinese scholars and other relevant applications.",{"paper_id":2725,"title":2726,"year":81,"month":855,"day":63,"doi":2727,"resource_url":2728,"first_page":63,"last_page":63,"pdf_url":2729,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2730,"paper_type":860,"authors":2731,"abstract":2743},"lrec2018-main-093","Enriching Frame Representations with Distributionally Induced 
Senses","10.63317\u002F4nuyge6ttv3c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-093","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F263.pdf","faralli-etal-2018-enriching",[2732,2735,2738,2740],{"paper_id":2725,"author_seq":247,"given_name":2733,"surname":2734,"affiliation":63,"orcid":63},"Stefano","Faralli",{"paper_id":2725,"author_seq":232,"given_name":2736,"surname":2737,"affiliation":63,"orcid":63},"Alexander","Panchenko",{"paper_id":2725,"author_seq":218,"given_name":1367,"surname":2739,"affiliation":63,"orcid":63},"Biemann",{"paper_id":2725,"author_seq":203,"given_name":2741,"surname":2742,"affiliation":63,"orcid":63},"Simone Paolo","Ponzetto","We introduce a new lexical resource that enriches the Framester knowledge graph, which links Framnet, WordNet, VerbNet and other resources, with semantic features from text corpora. These features are extracted from distributionally induced sense inventories and subsequently linked to the manually-constructed frame representations to boost the performance of frame disambiguation in context. Since Framester is a frame-based knowledge graph, which enables full-fledged OWL querying and reasoning, our resource paves the way for the development of novel, deeper semantic-aware applications that could benefit from the combination of knowledge from text and complex symbolic representations of events and participants. 
Together with the resource we also provide the software we developed for the evaluation in the task of Word Frame Disambiguation (WFD).",{"paper_id":2745,"title":2746,"year":81,"month":855,"day":63,"doi":2747,"resource_url":2748,"first_page":63,"last_page":63,"pdf_url":2749,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2750,"paper_type":860,"authors":2751,"abstract":2759},"lrec2018-main-094","An Integrated Formal Representation for Terminological and Lexical Data included in Classification Schemes","10.63317\u002F5p6eemyn46on","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-094","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F287.pdf","declerck-etal-2018-integrated",[2752,2753,2756],{"paper_id":2745,"author_seq":247,"given_name":881,"surname":1555,"affiliation":63,"orcid":63},{"paper_id":2745,"author_seq":232,"given_name":2754,"surname":2755,"affiliation":63,"orcid":63},"Kseniya","Egorova",{"paper_id":2745,"author_seq":218,"given_name":2757,"surname":2758,"affiliation":63,"orcid":63},"Eileen","Schnur","This paper presents our work dealing with a potential application in e-lexicography: the automatized creation of specialized multilingual dictionaries from structured data, which are available in the form of comparable multilingual classification schemes or taxonomies. As starting examples, we use comparable industry classification schemes, which frequently occur in the context of stock exchanges and business reports. Initially, we planned to follow an approach based on cross-taxonomies and cross-languages string mapping to automatically detect candidate multilingual dictionary entries for this specific domain. However, the need to first transform the comparable classification schemes into a shared formal representation language in order to be able to properly align their components before implementing the algorithms for the multilingual lexicon extraction soon became apparent. 
We opted for the SKOS-XL vocabulary for modelling the multilingual terminological part of the comparable taxonomies and for OntoLex-Lemon for modelling the multilingual lexical entries which can be extracted from the original data. In this paper, we present the suggested modelling architecture, which demonstrates how terminological elements and lexical items can be formally integrated and explicitly cross-linked in the context of the Linguistic Linked Open Data (LLOD).",{"paper_id":2761,"title":2762,"year":81,"month":855,"day":63,"doi":2763,"resource_url":2764,"first_page":63,"last_page":63,"pdf_url":2765,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2766,"paper_type":860,"authors":2767,"abstract":2777},"lrec2018-main-095","One event, many representations. Mapping action concepts through visual features.","10.63317\u002F3uavidpzxa66","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-095","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F787.pdf","panunzi-etal-2018-one",[2768,2771,2774],{"paper_id":2761,"author_seq":247,"given_name":2769,"surname":2770,"affiliation":63,"orcid":63},"Alessandro","Panunzi",{"paper_id":2761,"author_seq":232,"given_name":2772,"surname":2773,"affiliation":63,"orcid":63},"Lorenzo","Gregori",{"paper_id":2761,"author_seq":218,"given_name":2775,"surname":2776,"affiliation":63,"orcid":63},"Andrea Amelio","Ravelli","This paper faces the problem of unifying the representation of actions and events in different semantic resources. The proposed solution exploits the IMAGACT visual component (video scenes that represent physical actions) as the linkage point among resources. By using visual objects, we connected resources responding to different scopes and theoretical frameworks, in which a concept-to-concept mapping appeared difficult to obtain. 
We provide a brief description of two experiments that exploit IMAGACT videos as a linkage point: an automatic linking with BabelNet, a multilingual semantic network, and a manual linking with Praxicon, a conceptual knowledge base of action. The aim of this work is to integrate data from resources with different level of granularity in order to describe the action semantics from a linguistic, visual and motor point of view.",{"paper_id":2779,"title":2780,"year":81,"month":855,"day":63,"doi":2781,"resource_url":2782,"first_page":63,"last_page":63,"pdf_url":2783,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2784,"paper_type":860,"authors":2785,"abstract":2789},"lrec2018-main-096","Tel(s)-Telle(s)-Signs: Highly Accurate Automatic Crosslingual Hypernym Discovery","10.63317\u002F4gpfic9dbxzv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-096","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1080.pdf","wan-2018-tel",[2786],{"paper_id":2779,"author_seq":247,"given_name":2787,"surname":2788,"affiliation":63,"orcid":63},"Ada","Wan","We report a highly accurate hypernym discovery heuristic that works on unrestricted texts. This approach leverages morphological cues in French, but given any parallel data and word alignment tool, this proves to be a technique that can work reliably in other languages as well. We tested this method using two French-English corpora of different genres (medical and news) and attained near-perfect accuracy. The key idea is to exploit morphological information in the French trigger phrase 'tel(s)\u002Ftelle(s)- que' (meaning \"such as\" in English) to uniquely identify the correct hypernym. 
This shows to be an inexpensive and effective heuristic also when there are multiple noun phrases preceding the trigger phrase, as in the case of prepositional phrase attachment causing ambiguity in interpretation and hypernym acquisition, indicating that this pattern in French is more informative than its English counterpart.",{"paper_id":2791,"title":2792,"year":81,"month":855,"day":63,"doi":2793,"resource_url":2794,"first_page":63,"last_page":63,"pdf_url":2795,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2796,"paper_type":860,"authors":2797,"abstract":2806},"lrec2018-main-097","Disambiguation of Verbal Shifters","10.63317\u002F33qg7xyk453v","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-097","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F58.pdf","wiegand-etal-2018-disambiguation",[2798,2800,2803],{"paper_id":2791,"author_seq":247,"given_name":1780,"surname":2799,"affiliation":63,"orcid":63},"Wiegand",{"paper_id":2791,"author_seq":232,"given_name":2801,"surname":2802,"affiliation":63,"orcid":63},"Sylvette","Loda",{"paper_id":2791,"author_seq":218,"given_name":2804,"surname":2805,"affiliation":63,"orcid":63},"Josef","Ruppenhofer","Negation is an important contextual phenomenon that needs to be addressed in sentiment analysis. Next to common negation function words, such as \"not\" or \"none\", there is also a considerably large class of negation content words, also referred to as shifters, such as the verbs \"diminish\", \"reduce\" or \"reverse\". However, many of these shifters are ambiguous. For instance, \"spoil\" as in \"spoil your chance\" reverses the polarity of the positive polar expression \"chance\" while in \"spoil your loved ones\", no negation takes place. We present a supervised learning approach to disambiguating verbal shifters. 
Our approach takes into consideration various features, particularly generalization features.",{"paper_id":2808,"title":2809,"year":81,"month":855,"day":63,"doi":2810,"resource_url":2811,"first_page":63,"last_page":63,"pdf_url":2812,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2813,"paper_type":860,"authors":2814,"abstract":2821},"lrec2018-main-098","Bootstrapping Polar-Opposite Emotion Dimensions from Online Reviews","10.63317\u002F3wx77w5cu2oc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-098","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F95.pdf","huangfu-surdeanu-2018-bootstrapping",[2815,2818],{"paper_id":2808,"author_seq":247,"given_name":2816,"surname":2817,"affiliation":63,"orcid":63},"Luwen","Huangfu",{"paper_id":2808,"author_seq":232,"given_name":2819,"surname":2820,"affiliation":63,"orcid":63},"Mihai","Surdeanu","We propose a novel bootstrapping approach for the acquisition of lexicons from unannotated, informal online texts (in our case, Yelp reviews) for polar-opposite emotion dimension values from the Ortony\u002FClore\u002FCollins model of emotions (e.g., desirable\u002Fundesirable). Our approach mitigates the intrinsic problem of limited supervision in bootstrapping with an effective strategy that softly labels unlabeled terms, which are then used to better estimate the quality of extraction patterns. Further, we propose multiple solutions to control for semantic drift by taking advantage of the polarity of the categories to be learned (e.g., praiseworthy vs. blameworthy). 
Experimental results demonstrate that our algorithm achieves considerably better performance than several baselines.",{"paper_id":2823,"title":2824,"year":81,"month":855,"day":63,"doi":2825,"resource_url":2826,"first_page":63,"last_page":63,"pdf_url":2827,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2828,"paper_type":860,"authors":2829,"abstract":2836},"lrec2018-main-099","Sentiment-Stance-Specificity (SSS) Dataset: Identifying Support-based Entailment among Opinions.","10.63317\u002F4wefx3opwth4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-099","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F126.pdf","rajendran-etal-2018-sentiment",[2830,2833,2834],{"paper_id":2823,"author_seq":247,"given_name":2831,"surname":2832,"affiliation":63,"orcid":63},"Pavithra","Rajendran",{"paper_id":2823,"author_seq":232,"given_name":1540,"surname":1541,"affiliation":63,"orcid":63},{"paper_id":2823,"author_seq":218,"given_name":1269,"surname":2835,"affiliation":63,"orcid":63},"Parsons","Computational argumentation aims to model arguments as a set of premises that either support each other or collectively support a conclusion. We prepare three datasets of text-hypothesis pairs with support-based entailment based on opinions present in hotel reviews using a distant supervision approach. Support-based entailment is defined as the existence of a specific opinion (premise) that supports as well as entails a more general opinion and where these together support a generalised conclusion. A set of rules is proposed based on three different components — sentiment, stance and specificity to automatically predict support-based entailment. Two annotators manually annotated the relations among text-hypothesis pairs with an inter-rater agreement of 0.80. We compare the performance of the rules which gave an overall accuracy of 0.83. Further, we compare the performance of textual entailment under various conditions. 
The overall accuracy was 89.54%, 90.00% and 96.19% for our three datasets.",{"paper_id":2838,"title":2839,"year":81,"month":855,"day":63,"doi":2840,"resource_url":2841,"first_page":63,"last_page":63,"pdf_url":2842,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2843,"paper_type":860,"authors":2844,"abstract":2851},"lrec2018-main-100","Resource Creation Towards Automated Sentiment Analysis in Telugu (a low resource language) and Integrating Multiple Domain Sources to Enhance Sentiment Prediction","10.63317\u002F27s4yiofj5ah","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-100","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F146.pdf","gangula-mamidi-2018-resource",[2845,2848],{"paper_id":2838,"author_seq":247,"given_name":2846,"surname":2847,"affiliation":63,"orcid":63},"Rama Rohit Reddy","Gangula",{"paper_id":2838,"author_seq":232,"given_name":2849,"surname":2850,"affiliation":63,"orcid":63},"Radhika","Mamidi","Understanding the polarity or sentiment of a text is an important task in many application scenarios. Sentiment Analysis of a text can be used to answer various questions such as election prediction, favouredness towards any product etc. But the sentiment analysis task becomes challenging when it comes to low resource languages because the basis of learning sentiment classifiers are annotated datasets and annotated datasets for non-English texts hardly exists. So for the development of sentiment classifiers in Telugu, we have created corpora \"Sentiraama\" for different domains like movie reviews, song lyrics, product reviews and book reviews in Telugu language with the text written in Telugu script. In this paper, we describe the process of creating the corpora and assigning polarities to them. After the creation of corpora, we trained the classifiers that yields good classification results. 
Typically a sentiment classifier is trained using data from the same domain it is intended to be tested on. But there may not be sufficient data available in the same domain and additionally using data from multiple sources and domains may help in creating a more generalized sentiment classifier which can be applied to multiple domains. So to create this generalized classifier, we used the sentiment data from the above corpus from different domains. We first tested the performance of sentiment analysis models built using single data source for both in-domain and cross-domain classification. Later, we built sentiment model using data samples from multiple domains and then tested the performance of the models based on their classification. Finally, we compared all the three approaches based on the performance of the models and discussed the best approach for sentiment analysis.",{"paper_id":2853,"title":2854,"year":81,"month":855,"day":63,"doi":2855,"resource_url":2856,"first_page":63,"last_page":63,"pdf_url":2857,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2858,"paper_type":860,"authors":2859,"abstract":2866},"lrec2018-main-101","Multilingual Multi-class Sentiment Classification Using Convolutional Neural Networks","10.63317\u002F4suqsh39s5kg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-101","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F149.pdf","attia-etal-2018-multilingual",[2860,2861,2862,2865],{"paper_id":2853,"author_seq":247,"given_name":1166,"surname":1167,"affiliation":63,"orcid":63},{"paper_id":2853,"author_seq":232,"given_name":1160,"surname":1161,"affiliation":63,"orcid":63},{"paper_id":2853,"author_seq":218,"given_name":2863,"surname":2864,"affiliation":63,"orcid":63},"Ali","Elkahky",{"paper_id":2853,"author_seq":203,"given_name":1172,"surname":1173,"affiliation":63,"orcid":63},"This paper describes a language-independent model for multi-class sentiment analysis using a simple 
neural network architecture of five layers (Embedding, Conv1D, GlobalMaxPooling and two Fully-Connected). The advantage of the proposed model is that it does not rely on language-specific features such as ontologies, dictionaries, or morphological or syntactic pre-processing. Equally important, our system does not use pre-trained word2vec embeddings which can be costly to obtain and train for some languages. In this research, we also demonstrate that oversampling can be an effective approach for correcting class imbalance in the data. We evaluate our methods on three publicly available datasets for English, German and Arabic, and the results show that our system’s performance is comparable to, or even better than, the state of the art for these datasets. We make our source-code publicly available.",{"paper_id":2868,"title":2869,"year":81,"month":855,"day":63,"doi":2870,"resource_url":2871,"first_page":63,"last_page":63,"pdf_url":2872,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2873,"paper_type":860,"authors":2874,"abstract":2884},"lrec2018-main-102","A Large Self-Annotated Corpus for Sarcasm","10.63317\u002F2ifwj74dgdbw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-102","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F160.pdf","khodak-etal-2018-large",[2875,2878,2881],{"paper_id":2868,"author_seq":247,"given_name":2876,"surname":2877,"affiliation":63,"orcid":63},"Mikhail","Khodak",{"paper_id":2868,"author_seq":232,"given_name":2879,"surname":2880,"affiliation":63,"orcid":63},"Nikunj","Saunshi",{"paper_id":2868,"author_seq":218,"given_name":2882,"surname":2883,"affiliation":63,"orcid":63},"Kiran","Vodrahalli","We introduce the Self-Annotated Reddit Corpus (SARC), a large corpus for sarcasm research and for training and evaluating systems for sarcasm detection. 
The corpus has 1.3 million sarcastic statements --- 10 times more than any previous dataset --- and many times more instances of non-sarcastic statements, allowing for learning in both balanced and unbalanced label regimes. Each statement is furthermore self-annotated --- sarcasm is labeled by the author, not an independent annotator --- and provided with user, topic, and conversation context. We evaluate the corpus for accuracy, construct benchmarks for sarcasm detection, and evaluate baseline methods.",{"paper_id":2886,"title":2887,"year":81,"month":855,"day":63,"doi":2888,"resource_url":2889,"first_page":63,"last_page":63,"pdf_url":2890,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2891,"paper_type":860,"authors":2892,"abstract":2920},"lrec2018-main-103","HappyDB: A Corpus of 100,000 Crowdsourced Happy Moments","10.63317\u002F2vs32uubt6id","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-103","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F204.pdf","asai-etal-2018-happydb",[2893,2896,2898,2901,2904,2906,2909,2912,2915,2918],{"paper_id":2886,"author_seq":247,"given_name":2894,"surname":2895,"affiliation":63,"orcid":63},"Akari","Asai",{"paper_id":2886,"author_seq":232,"given_name":2650,"surname":2897,"affiliation":63,"orcid":63},"Evensen",{"paper_id":2886,"author_seq":218,"given_name":2899,"surname":2900,"affiliation":63,"orcid":63},"Behzad","Golshan",{"paper_id":2886,"author_seq":203,"given_name":2902,"surname":2903,"affiliation":63,"orcid":63},"Alon","Halevy",{"paper_id":2886,"author_seq":188,"given_name":2905,"surname":1591,"affiliation":63,"orcid":63},"Vivian",{"paper_id":2886,"author_seq":172,"given_name":2907,"surname":2908,"affiliation":63,"orcid":63},"Andrei","Lopatenko",{"paper_id":2886,"author_seq":155,"given_name":2910,"surname":2911,"affiliation":63,"orcid":63},"Daniela","Stepanov",{"paper_id":2886,"author_seq":138,"given_name":2913,"surname":2914,"affiliation":63,"orcid":63},"Yoshihik
o","Suhara",{"paper_id":2886,"author_seq":121,"given_name":2916,"surname":2917,"affiliation":63,"orcid":63},"Wang-Chiew","Tan",{"paper_id":2886,"author_seq":104,"given_name":2919,"surname":1585,"affiliation":63,"orcid":63},"Yinzhan","The science of happiness is an area of positive psychology concerned with understanding what behaviors make people happy in a sustainable fashion. Recently, there has been interest in developing technologies that help incorporate the findings of the science of happiness into users' daily lives by steering them towards behaviors that increase happiness. With the goal of building technology that can understand how people express their happy moments in text, we crowd-sourced HappyDB, a corpus of 100,000 happy moments that we make publicly available. This paper describes HappyDB and its properties, and outlines several important NLP problems that can be studied with the help of the corpus. We also apply several state-of-the-art analysis techniques to analyze HappyDB. 
Our results demonstrate the need for deeper NLP techniques to be developed which makes HappyDB an exciting resource for follow-on research.",{"paper_id":2922,"title":2923,"year":81,"month":855,"day":63,"doi":2924,"resource_url":2925,"first_page":63,"last_page":63,"pdf_url":2926,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2927,"paper_type":860,"authors":2928,"abstract":2936},"lrec2018-main-104","MultiBooked: A Corpus of Basque and Catalan Hotel Reviews Annotated for Aspect-level Sentiment Classification","10.63317\u002F3kefhvno4qrm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-104","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F217.pdf","barnes-etal-2018-multibooked",[2929,2931,2934],{"paper_id":2922,"author_seq":247,"given_name":1124,"surname":2930,"affiliation":63,"orcid":63},"Barnes",{"paper_id":2922,"author_seq":232,"given_name":2932,"surname":2933,"affiliation":63,"orcid":63},"Toni","Badia",{"paper_id":2922,"author_seq":218,"given_name":1272,"surname":2935,"affiliation":63,"orcid":63},"Lambert","While sentiment analysis has become an established field in the NLP community, research into languages other than English has been hindered by the lack of resources. Although much research in multi-lingual and cross-lingual sentiment analysis has focused on unsupervised or semi-supervised approaches, these still require a large number of resources and do not reach the performance of supervised approaches. With this in mind, we introduce two datasets for supervised aspect-level sentiment analysis in Basque and Catalan, both of which are under-resourced languages. 
We provide high-quality annotations and benchmarks with the hope that they will be useful to the growing community of researchers working on these languages.",{"paper_id":2938,"title":2939,"year":81,"month":855,"day":63,"doi":2940,"resource_url":2941,"first_page":63,"last_page":63,"pdf_url":2942,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2943,"paper_type":860,"authors":2944,"abstract":2954},"lrec2018-main-105","BlogSet-BR: A Brazilian Portuguese Blog Corpus","10.63317\u002F34xgyquk7mxj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-105","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F10.pdf","santos-etal-2018-blogset",[2945,2948,2951],{"paper_id":2938,"author_seq":247,"given_name":2946,"surname":2947,"affiliation":63,"orcid":63},"Henrique","Santos",{"paper_id":2938,"author_seq":232,"given_name":2949,"surname":2950,"affiliation":63,"orcid":63},"Vinicius","Woloszyn",{"paper_id":2938,"author_seq":218,"given_name":2952,"surname":2953,"affiliation":63,"orcid":63},"Renata","Vieira","The rich user-generated contend found on internet bloggs have always attracted the interest of scientific communities for many different purposes, such as from opinion and sentiment mining, information extraction or topic discovery. Nonetheless, a extensive corpora is essential to perform most of Natural Language Processing involved in these tasks. This paper presents BlogSet-BR, a extensive Brazilian Portuguese corpus containing 2.1 billions words extracted from 7.4 millions posts over 808 thousand different Brazilian blogs. 
Additionally, a extensible survey was conducted with authors to draw a profile of Brazilian bloggers.",{"paper_id":2956,"title":2957,"year":81,"month":855,"day":63,"doi":2958,"resource_url":2959,"first_page":63,"last_page":63,"pdf_url":2960,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2961,"paper_type":860,"authors":2962,"abstract":2965},"lrec2018-main-106","SoMeWeTa: A Part-of-Speech Tagger for German Social Media and Web Texts","10.63317\u002F4yrn97cfdc7k","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-106","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F49.pdf","proisl-2018-someweta",[2963],{"paper_id":2956,"author_seq":247,"given_name":1615,"surname":2964,"affiliation":63,"orcid":63},"Proisl","Off-the-shelf part-of-speech taggers typically perform relatively poorly on web and social media texts since those domains are quite different from the newspaper articles on which most tagger models are trained. In this paper, we describe SoMeWeTa, a part-of-speech tagger based on the averaged structured perceptron that is capable of domain adaptation and that can use various external resources. We train the tagger on the German web and social media data of the EmpiriST 2015 shared task. Using the TIGER corpus as background data and adding external information about word classes and Brown clusters, we substantially improve on the state of the art for both the web and the social media data sets. 
The tagger is available as free software.",{"paper_id":2967,"title":2968,"year":81,"month":855,"day":63,"doi":2969,"resource_url":2970,"first_page":63,"last_page":63,"pdf_url":2971,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2972,"paper_type":860,"authors":2973,"abstract":2986},"lrec2018-main-107","Collecting Code-Switched Data from Social Media","10.63317\u002F2nwjg8pcf7dv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-107","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F92.pdf","mendels-etal-2018-collecting",[2974,2977,2980,2983],{"paper_id":2967,"author_seq":247,"given_name":2975,"surname":2976,"affiliation":63,"orcid":63},"Gideon","Mendels",{"paper_id":2967,"author_seq":232,"given_name":2978,"surname":2979,"affiliation":63,"orcid":63},"Victor","Soto",{"paper_id":2967,"author_seq":218,"given_name":2981,"surname":2982,"affiliation":63,"orcid":63},"Aaron","Jaech",{"paper_id":2967,"author_seq":203,"given_name":2984,"surname":2985,"affiliation":63,"orcid":63},"Julia","Hirschberg","We address the problem of mining code-switched data from the web, where code-switching is defined as the tendency of bilinguals to switch between their multiple languages both across and within utterances.  We propose a method that identifies data as code-switched in languages L1 and L2 when a language classifier labels the document as language L1 but the document also contains words that can only belong to L2. We apply our method to Twitter data and collect a set of more than 43,000 tweets. We obtain language identifiers for a subset of 8,000 tweets using crowd-sourcing with high inter-annotator agreement and accuracy. We validate our Twitter corpus by comparing it to the Spanish-English corpus of code-switched tweets collected for the EMNLP 2016 Shared Task for Language Identification, in terms of code-switching rates, language composition and amount of code-switch types found in both datasets.  
We then trained language taggers on both corpora and show that a tagger trained on the EMNLP corpus exhibits a considerable drop in accuracy when tested on the new corpus and a tagger trained on our new corpus achieves very high accuracy when tested on both corpora.",{"paper_id":2988,"title":2989,"year":81,"month":855,"day":63,"doi":2990,"resource_url":2991,"first_page":63,"last_page":63,"pdf_url":2992,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":2993,"paper_type":860,"authors":2994,"abstract":3001},"lrec2018-main-108","Classifying the Informative Behaviour of Emoji in Microblogs","10.63317\u002F484esdnke54c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-108","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F253.pdf","donato-paggio-2018-classifying",[2995,2998],{"paper_id":2988,"author_seq":247,"given_name":2996,"surname":2997,"affiliation":63,"orcid":63},"Giulia","Donato",{"paper_id":2988,"author_seq":232,"given_name":2999,"surname":3000,"affiliation":63,"orcid":63},"Patrizia","Paggio","Emoji are pictographs commonly used in microblogs as emotion markers, but they can also represent a much wider range of concepts. Additionally, they may occur in different positions within a message (e.g. a tweet), appear in sequences or act as word substitute. Emoji must be considered necessary elements in the analysis and processing of user generated content, since they can either provide fundamental syntactic information, emphasize what is already expressed in the text, or carry meaning that cannot be inferred from the words alone. We collected and annotated a corpus of 2475 tweets pairs with the aim of analyzing and then classifying emoji use with respect to redundancy. The best classification model achieved an F-score of 0.7. 
In this paper we shortly present the corpus, and we describe the classification experiments, explain the predictive features adopted, discuss the problematic aspects of our approach and suggest future improvements.",{"paper_id":3003,"title":3004,"year":81,"month":855,"day":63,"doi":3005,"resource_url":3006,"first_page":63,"last_page":63,"pdf_url":3007,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3008,"paper_type":860,"authors":3009,"abstract":3018},"lrec2018-main-109","A Taxonomy for In-depth Evaluation of Normalization for User Generated Content","10.63317\u002F5cb9nkjw53km","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-109","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F306.pdf","van-der-goot-etal-2018-taxonomy",[3010,3013,3016],{"paper_id":3003,"author_seq":247,"given_name":3011,"surname":3012,"affiliation":63,"orcid":63},"Rob","van der Goot",{"paper_id":3003,"author_seq":232,"given_name":3014,"surname":3015,"affiliation":63,"orcid":63},"Rik","van Noord",{"paper_id":3003,"author_seq":218,"given_name":3017,"surname":3015,"affiliation":63,"orcid":63},"Gertjan","In this work we present a taxonomy of error categories for lexical normalization, which is the task of translating user generated content to canonical language. We annotate a recent normalization dataset to test the practical use of the taxonomy and read a near-perfect agreement. This annotated dataset is then used to evaluate how an existing normalization model performs on the different categories of the taxonomy. 
The results of this evaluation reveal that some of the problematic categories only include minor transformations, whereas most regular transformations are solved quite well.",{"paper_id":3020,"title":3021,"year":81,"month":855,"day":63,"doi":3022,"resource_url":3023,"first_page":63,"last_page":63,"pdf_url":3024,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3025,"paper_type":860,"authors":3026,"abstract":3032},"lrec2018-main-110","Gaining and Losing Influence in Online Conversation","10.63317\u002F2kq2b2ateein","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-110","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F355.pdf","sharma-strzalkowski-2018-gaining",[3027,3029],{"paper_id":3020,"author_seq":247,"given_name":3028,"surname":1847,"affiliation":63,"orcid":63},"Arun",{"paper_id":3020,"author_seq":232,"given_name":3030,"surname":3031,"affiliation":63,"orcid":63},"Tomek","Strzalkowski","In this paper, we describe a study we conducted to determine, if a person who is highly influential in a discussion on a familiar topic would retain influence when moving to a topic that is less familiar or perhaps not as interesting. For this research, we collected samples of realistic on-line chat room discussions on several topics related to current issues in education, technology, arts, sports, finances, current affairs, etc. The collected data allowed us to create models for specific types of conversational behavior, such as agreement, disagreement, support, persuasion, negotiation, etc. These models were used to study influence in online discussions. It also allowed us to study how human influence works in online discussion and what affects a person's influence from one topic to another. 
We found that influence is impacted by topic familiarity, sometimes dramatically, and we explain how it is affected and why.",{"paper_id":3034,"title":3035,"year":81,"month":855,"day":63,"doi":3036,"resource_url":3037,"first_page":63,"last_page":63,"pdf_url":3038,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3039,"paper_type":860,"authors":3040,"abstract":3047},"lrec2018-main-111","Arap-Tweet: A Large Multi-Dialect Twitter Corpus for Gender, Age and Language Variety Identification","10.63317\u002F32ycwnmhmp3n","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-111","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F521.pdf","zaghouani-charfi-2018-arap",[3041,3044],{"paper_id":3034,"author_seq":247,"given_name":3042,"surname":3043,"affiliation":63,"orcid":63},"Wajdi","Zaghouani",{"paper_id":3034,"author_seq":232,"given_name":3045,"surname":3046,"affiliation":63,"orcid":63},"Anis","Charfi","In this paper, we present Arap-Tweet, which is a large-scale and multi-dialectal corpus of Tweets from 11 regions and 16 countries in the Arab world representing the major Arabic dialectal varieties. To build this corpus, we collected data from Twitter and we provided a team of experienced annotators with annotation guidelines that they used to annotate the corpus for age categories, gender, and dialectal variety. During the data collection effort, we based our search on distinctive keywords that are specific to the different Arabic dialects and we also validated the location using Twitter API. In this paper, we report on the corpus data collection and annotation efforts. We also present some issues that we encountered during these phases. Then, we present the results of the evaluation performed to ensure the consistency of the annotation. 
The provided corpus will enrich the limited set of available language resources for Arabic and will be an invaluable enabler for developing author profiling tools and NLP tools for Arabic.",{"paper_id":3049,"title":3050,"year":81,"month":855,"day":63,"doi":3051,"resource_url":3052,"first_page":63,"last_page":63,"pdf_url":3053,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3054,"paper_type":860,"authors":3055,"abstract":3064},"lrec2018-main-112","Transc&Anno: A Graphical Tool for the Transcription and On-the-Fly Annotation of Handwritten Documents","10.63317\u002F34b8vvkzmxy9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-112","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F107.pdf","okinina-etal-2018-transc",[3056,3059,3061],{"paper_id":3049,"author_seq":247,"given_name":3057,"surname":3058,"affiliation":63,"orcid":63},"Nadezda","Okinina",{"paper_id":3049,"author_seq":232,"given_name":3060,"surname":1641,"affiliation":63,"orcid":63},"Lionel",{"paper_id":3049,"author_seq":218,"given_name":3062,"surname":3063,"affiliation":63,"orcid":63},"Verena","Lyding","We present Transc&Anno, a web-based collaboration tool allowing the transcription of text images and their shallow on-the-fly annotation. Transc&Anno was originally developed in order to address the needs of learner corpora research so as to facilitate digitisation of handwritten learner essays. However, the tool can be used for the creation of any type of corpora requiring transcription and shallow on-the-fly annotation resulting in inline XML. Transc&Anno provides an intuitive environment that is explicitly designed to facilitate the transcription and annotation process for linguists. Transc&Anno ensures a high transcription output quality by validating the XML and only allowing predefined tags. 
It was created on top of the FromThePage transcription tool developed entirely with standard web technologies – Ruby on Rails, Javascript, HTML, and CSS. We adapted this open-source web-based tool to linguistic research purposes by adding linguistic annotation functionalities to it. Thereby we united the convenience of a collaborative transcription tool with its advanced image visualisation, centralised data storage, version control and inter-collaborator communication facilities with the precision of a linguistic annotation tool with its well-developed tag definition possibilities, easy tagging process and tagged-text visualisation. Transc&Anno is easily customisable, open source, and available on Github.",{"paper_id":3066,"title":3067,"year":81,"month":855,"day":63,"doi":3068,"resource_url":3069,"first_page":63,"last_page":63,"pdf_url":3070,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3071,"paper_type":860,"authors":3072,"abstract":3079},"lrec2018-main-113","Correction of OCR Word Segmentation Errors in Articles from the ACL Collection through Neural Machine Translation Methods","10.63317\u002F2gn4wbpg9mkw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-113","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F114.pdf","nastase-hitschler-2018-correction",[3073,3076],{"paper_id":3066,"author_seq":247,"given_name":3074,"surname":3075,"affiliation":63,"orcid":63},"Vivi","Nastase",{"paper_id":3066,"author_seq":232,"given_name":3077,"surname":3078,"affiliation":63,"orcid":63},"Julian","Hitschler","Depending on the quality of the original document, Optical Character Recognition (OCR) can produce a range of errors -- from erroneous letters to additional and spurious blank spaces. We applied a sequence-to-sequence machine translation system to correct word-segmentation OCR errors in scientific texts from the ACL collection with an estimated precision and recall above 0.95 on test data. 
We present the correction process and results.",{"paper_id":3081,"title":3082,"year":81,"month":855,"day":63,"doi":3083,"resource_url":3084,"first_page":63,"last_page":63,"pdf_url":3085,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3086,"paper_type":860,"authors":3087,"abstract":3089},"lrec2018-main-114","From Manuscripts to Archetypes through Iterative Clustering","10.63317\u002F23ivsw2r2hn5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-114","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F314.pdf","hoenen-2018-manuscripts",[3088],{"paper_id":3081,"author_seq":247,"given_name":2501,"surname":2502,"affiliation":63,"orcid":63},"Corpora of manuscripts of the same ancient text often preserve many variants. This is so because upon copying over long copy chains errors and editorial changes have been repeatedly made and reverted to the effect that most often no 2 variant texts of the same so-called textual tradition have exactly the same text. Obviously in order to save the time to read all of the versions and in order to enable discourse and unambiguous referencing, philologists have since the beginnings of the age of print embarked on providing one single textual representation of this variety. In computational terms one tries to retrieve\u002Fcompose the base text which is most likely the latest common ancestor (archetype) of all observed variants. Computationally, stemmata – that is trees depicting the copy history (manuscripts = nodes, Copy processes = edges) – have been computed and evaluated automatically (Roos and Heikkilä, 2009). Likewise, automatic archetype reconstruction has been introduced lately, (Hoenen, 2015b; Koppel et al., 2016). A synthesis of both stemma generation and archetype reconstruction has not yet been achieved. 
This paper therefore presents an approach where through iterative clustering a stemma and an archetype text are being reconstructed bottom-up.",{"paper_id":3091,"title":3092,"year":81,"month":855,"day":63,"doi":3093,"resource_url":3094,"first_page":63,"last_page":63,"pdf_url":3095,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3096,"paper_type":860,"authors":3097,"abstract":3105},"lrec2018-main-115","Building A Handwritten Cuneiform Character Imageset","10.63317\u002F4hkn592xca45","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-115","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F374.pdf","yamauchi-etal-2018-building",[3098,3100,3102],{"paper_id":3091,"author_seq":247,"given_name":2474,"surname":3099,"affiliation":63,"orcid":63},"Yamauchi",{"paper_id":3091,"author_seq":232,"given_name":3101,"surname":2272,"affiliation":63,"orcid":63},"Hajime",{"paper_id":3091,"author_seq":218,"given_name":3103,"surname":3104,"affiliation":63,"orcid":63},"Wakaha","Mori","Digitization of cuneiform documents is important to boost the research activity on ancient Middle East and some projects have been launched around 2000. However, the digitization process is laborious due to the huge scale of the documents and no trustworthy (semi-)automatic method has been established. In this paper, we focused on a cuneiform document digitization task, realization of Optical Character Recognition (OCR) method from the handwritten copies of original materials. Currently, as the first step toward development of such methods, we are constructing a handwritten cuneiform character imageset named with professional assistance. 
This imageset contains typical stroke patterns for handwriting each frequently appearing cuneiform character and will be able to support the development of handwritten cuneiform OCR system.",{"paper_id":3107,"title":3108,"year":81,"month":855,"day":63,"doi":3109,"resource_url":3110,"first_page":63,"last_page":63,"pdf_url":3111,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3112,"paper_type":860,"authors":3113,"abstract":3122},"lrec2018-main-116","PDF-to-Text Reanalysis for Linguistic Data Mining","10.63317\u002F4zhr2626de8t","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-116","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F947.pdf","goodman-etal-2018-pdf",[3114,3117,3120],{"paper_id":3107,"author_seq":247,"given_name":3115,"surname":3116,"affiliation":63,"orcid":63},"Michael Wayne","Goodman",{"paper_id":3107,"author_seq":232,"given_name":3118,"surname":3119,"affiliation":63,"orcid":63},"Ryan","Georgi",{"paper_id":3107,"author_seq":218,"given_name":3121,"surname":1386,"affiliation":63,"orcid":63},"Fei","Extracting semi-structured text from scientific writing in PDF files is a difficult task that has faced researchers for decades. In the 1990s, this task was largely a computer vision and OCR problem, as PDF files were often the result of scanning printed documents. Today, PDFs have standardized digital typesetting without the need for OCR, but extraction of semi-structured text from these documents remains a nontrivial task. In this paper, we present a system for the reanalysis of glyph-level PDF extracted text that performs block detection, respacing, and tabular data analysis for the purposes of linguistic data mining. 
We further present our reanalyzed output format, which attempts to eliminate the extreme verbosity of XML output while leaving important positional information available for downstream processes.",{"paper_id":3124,"title":3125,"year":81,"month":855,"day":63,"doi":3126,"resource_url":3127,"first_page":63,"last_page":63,"pdf_url":3128,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3129,"paper_type":860,"authors":3130,"abstract":3136},"lrec2018-main-117","Crowdsourced Multimodal Corpora Collection Tool","10.63317\u002F55tysysg2bbk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-117","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F9.pdf","jonell-etal-2018-crowdsourced",[3131,3132,3133,3134,3135],{"paper_id":3124,"author_seq":247,"given_name":1272,"surname":1273,"affiliation":63,"orcid":63},{"paper_id":3124,"author_seq":232,"given_name":1275,"surname":1276,"affiliation":63,"orcid":63},{"paper_id":3124,"author_seq":218,"given_name":1263,"surname":1264,"affiliation":63,"orcid":63},{"paper_id":3124,"author_seq":203,"given_name":1278,"surname":1279,"affiliation":63,"orcid":63},{"paper_id":3124,"author_seq":188,"given_name":1284,"surname":1285,"affiliation":63,"orcid":63},"In recent years, more and more multimodal corpora have been created. To our knowledge there is no publicly available tool which allows for acquiring controlled multimodal data of people in a rapid and scalable fashion. We therefore are proposing (1) a novel tool which will enable researchers to rapidly gather large amounts of multimodal data spanning a wide demographic range, and (2) an example of how we used this tool for corpus collection of our ``Attentive listener'' multimodal corpus. The code is released under an Apache License 2.0 and available as an open-source repository, which can be found at \\url{https:\u002F\u002Fgithub.com\u002Fkth-social-robotics\u002Fmultimodal-crowdsourcing-tool}. 
This tool will allow researchers to set-up their own multimodal data collection system quickly and create their own multimodal corpora. Finally, this paper provides a discussion about the advantages and disadvantages with a crowd-sourced data collection tool, especially in comparison to a lab recorded corpora.",{"paper_id":3138,"title":3139,"year":81,"month":855,"day":63,"doi":3140,"resource_url":3141,"first_page":63,"last_page":63,"pdf_url":3142,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3143,"paper_type":860,"authors":3144,"abstract":3159},"lrec2018-main-118","Expert Evaluation of a Spoken Dialogue System in a Clinical Operating Room","10.63317\u002F4tkqwzuczsc8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-118","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F168.pdf","miehle-etal-2018-expert",[3145,3148,3151,3153,3156,3157],{"paper_id":3138,"author_seq":247,"given_name":3146,"surname":3147,"affiliation":63,"orcid":63},"Juliana","Miehle",{"paper_id":3138,"author_seq":232,"given_name":3149,"surname":3150,"affiliation":63,"orcid":63},"Nadine","Gerstenlauer",{"paper_id":3138,"author_seq":218,"given_name":2554,"surname":3152,"affiliation":63,"orcid":63},"Ostler",{"paper_id":3138,"author_seq":203,"given_name":3154,"surname":3155,"affiliation":63,"orcid":63},"Hubertus","Feußner",{"paper_id":3138,"author_seq":188,"given_name":1251,"surname":1252,"affiliation":63,"orcid":63},{"paper_id":3138,"author_seq":172,"given_name":1471,"surname":3158,"affiliation":63,"orcid":63},"Ultes","With the emergence of new technologies, the surgical working environment becomes increasingly complex and comprises many medical devices which have to be monitored and controlled. 
With the aim of improving productivity and reducing the workload for the operating staff, we have developed an Intelligent Digital Assistant for Clinical Operating Rooms (IDACO) which allows the surgeon to control the operating room using natural spoken language. As speech is the modality used by the surgeon to communicate with their staff, using it to control the technical devices does not pose an additional mental burden. Therefore, we claim that the surgical environment presents a potential field of application for Spoken Dialogue Systems. In this work, we present the design and implementation of IDACO as well as the evaluation in an experimental set-up by specialists in the field of minimally invasive surgery. Our expert evaluation yields promising results and allows to conclude that clinical operating rooms are indeed an expedient area of application for Spoken Dialogue Systems.",{"paper_id":3161,"title":3162,"year":81,"month":855,"day":63,"doi":3163,"resource_url":3164,"first_page":63,"last_page":63,"pdf_url":3165,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3166,"paper_type":860,"authors":3167,"abstract":3174},"lrec2018-main-119","JAIST Annotated Corpus of Free Conversation","10.63317\u002F3vbmxfpsvfdd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-119","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F179.pdf","shirai-fukuoka-2018-jaist",[3168,3171],{"paper_id":3161,"author_seq":247,"given_name":3169,"surname":3170,"affiliation":63,"orcid":63},"Kiyoaki","Shirai",{"paper_id":3161,"author_seq":232,"given_name":3172,"surname":3173,"affiliation":63,"orcid":63},"Tomotaka","Fukuoka","This paper introduces an annotated corpus of free conversations in Japanese. It is manually annotated with two kinds of linguistic information: dialog act and sympathy. 
First, each utterance in the free conversation is annotated with its dialog act, which is chosen from a coarse-grained set consisting of nine dialog act labels. Cohen's kappa of the dialog act annotation between two annotators was 0.636. Second, each utterance is judged whether the speaker expresses his\u002Fher sympathy or antipathy toward the other participant or the current topic in the conversation. Cohen's kappa of sympathy tagging was 0.27, indicating the difficulty of the sympathy identification task. As a result, the corpus consists of 92,031 utterances in 97 dialogs. Our corpus is the first annotated corpus of Japanese free conversations that is publicly available.",{"paper_id":3176,"title":3177,"year":81,"month":855,"day":63,"doi":3178,"resource_url":3179,"first_page":63,"last_page":63,"pdf_url":3180,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3181,"paper_type":860,"authors":3182,"abstract":3214},"lrec2018-main-120","The Metalogue Debate Trainee Corpus: Data Collection and 
Annotations","10.63317\u002F5hnqsunjmvsf","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-120","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F186.pdf","petukhova-etal-2018-metalogue",[3183,3186,3188,3191,3194,3197,3200,3203,3206,3208,3209,3212],{"paper_id":3176,"author_seq":247,"given_name":3184,"surname":3185,"affiliation":63,"orcid":63},"Volha","Petukhova",{"paper_id":3176,"author_seq":232,"given_name":2907,"surname":3187,"affiliation":63,"orcid":63},"Malchanau",{"paper_id":3176,"author_seq":218,"given_name":3189,"surname":3190,"affiliation":63,"orcid":63},"Youssef","Oualil",{"paper_id":3176,"author_seq":203,"given_name":3192,"surname":3193,"affiliation":63,"orcid":63},"Dietrich","Klakow",{"paper_id":3176,"author_seq":188,"given_name":3195,"surname":3196,"affiliation":63,"orcid":63},"Saturnino","Luz",{"paper_id":3176,"author_seq":172,"given_name":3198,"surname":3199,"affiliation":63,"orcid":63},"Fasih","Haider",{"paper_id":3176,"author_seq":155,"given_name":3201,"surname":3202,"affiliation":63,"orcid":63},"Nick","Campbell",{"paper_id":3176,"author_seq":138,"given_name":3204,"surname":3205,"affiliation":63,"orcid":63},"Dimitris","Koryzis",{"paper_id":3176,"author_seq":121,"given_name":3204,"surname":3207,"affiliation":63,"orcid":63},"Spiliotopoulos",{"paper_id":3176,"author_seq":104,"given_name":1388,"surname":1685,"affiliation":63,"orcid":63},{"paper_id":3176,"author_seq":87,"given_name":3210,"surname":3211,"affiliation":63,"orcid":63},"Nicklas","Linz",{"paper_id":3176,"author_seq":73,"given_name":2633,"surname":3213,"affiliation":63,"orcid":63},"Alexandersson","This paper describes the Metalogue Debate Trainee Corpus (DTC). DTC has been collected and annotated in order to facilitate the design of instructional and interactive models for Virtual Debate Coach application - an intelligent tutoring system used by young parliamentarians to train their debate skills. 
The training is concerned with the use of appropriate multimodal rhetorical devices in order to improve (1) the organization of arguments, (2) arguments' content selection, and (3) argument delivery techniques. DTC contains tracking data from motion and speech capturing devices and semantic annotations - dialogue acts - as defined in ISO 24617-2 and discourse relations as defined in ISO 24617-8. The corpus comes with a manual describing the data collection process, annotation activities including an overview of basic concepts and their definitions including annotation schemes and guidelines on how to apply them, tools and other resources. DTC will be released in the ELRA catalogue in second half of 2018.",{"paper_id":3216,"title":3217,"year":81,"month":855,"day":63,"doi":3218,"resource_url":3219,"first_page":63,"last_page":63,"pdf_url":3220,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3221,"paper_type":860,"authors":3222,"abstract":3228},"lrec2018-main-121","Towards Continuous Dialogue Corpus Creation: writing to corpus and generating from it","10.63317\u002F2bw4597yx2en","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-121","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F188.pdf","malchanau-etal-2018-towards",[3223,3224,3225],{"paper_id":3216,"author_seq":247,"given_name":2907,"surname":3187,"affiliation":63,"orcid":63},{"paper_id":3216,"author_seq":232,"given_name":3184,"surname":3185,"affiliation":63,"orcid":63},{"paper_id":3216,"author_seq":218,"given_name":3226,"surname":3227,"affiliation":63,"orcid":63},"Harry","Bunt","This paper describes a method to create dialogue corpora annotated with interoperable semantic information. The corpus        development is performed following the ISO linguistic annotation framework and primary data encoding initiatives.  
The Continuous Dialogue Corpus Creation (D3C) methodology is  proposed,  where  a  corpus is used as a shared repository for analysis and modelling of interactive dialogue behaviour, and for implementation, integration and evaluation of dialogue system  components.  All these activities are supported by the  use of  ISO standard data models including annotation schemes, encoding formats, tools,  and  architectures. Standards also facilitate practical work in dialogue system  implementation,  deployment, evaluation and re-training, and enabling automatic generation of adequate system behaviour from the data. The proposed methodology is applied to the data-driven design of two multimodal interactive applications - the Virtual Negotiation Coach, used for the training of metacognitive skills in a multi-issue bargaining setting, and the Virtual Debate Coach, used for the training of debate skills in political contexts.",{"paper_id":3230,"title":3231,"year":81,"month":855,"day":63,"doi":3232,"resource_url":3233,"first_page":63,"last_page":63,"pdf_url":3234,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3235,"paper_type":860,"authors":3236,"abstract":3240},"lrec2018-main-122","MYCanCor: A Video Corpus of spoken Malaysian Cantonese","10.63317\u002F4rbmaighz498","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-122","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F192.pdf","liesenfeld-2018-mycancor",[3237],{"paper_id":3230,"author_seq":247,"given_name":3238,"surname":3239,"affiliation":63,"orcid":63},"Andreas","Liesenfeld","The Malaysia Cantonese Corpus (MYCanCor) is a collection of recordings of Malaysian Cantonese speech mainly collected in Perak, Malaysia. The corpus consists of around 20 hours of video recordings of spontaneous talk-in-interaction (56 settings) typically involving 2-4 speakers. A short scene description as well as basic speaker information is provided for each recording. 
The corpus is transcribed in CHAT (minCHAT) format and presented in traditional Chinese characters (UTF8) using the Hong Kong Supplementary Character Set (HKSCS). MYCanCor is expected to be a useful resource for researchers interested in any aspect of spoken language processing or Chinese multimodal corpora.",{"paper_id":3242,"title":3243,"year":81,"month":855,"day":63,"doi":3244,"resource_url":3245,"first_page":63,"last_page":63,"pdf_url":3246,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3247,"paper_type":860,"authors":3248,"abstract":3256},"lrec2018-main-123","KTH Tangrams: A Dataset for Research on Alignment and Conceptual Pacts in Task-Oriented Dialogue","10.63317\u002F2o4ybobztjsz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-123","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F267.pdf","shore-etal-2018-kth",[3249,3252,3255],{"paper_id":3242,"author_seq":247,"given_name":3250,"surname":3251,"affiliation":63,"orcid":63},"Todd","Shore",{"paper_id":3242,"author_seq":232,"given_name":3253,"surname":3254,"affiliation":63,"orcid":63},"Theofronia","Androulakaki",{"paper_id":3242,"author_seq":218,"given_name":1281,"surname":1282,"affiliation":63,"orcid":63},"There is a growing body of research focused on task-oriented instructor-manipulator dialogue, whereby one dialogue participant initiates a reference to an entity in a common environment while the other participant must resolve this reference in order to manipulate said entity. Many of these works are based on disparate if nevertheless similar datasets. 
This paper describes an English corpus of referring expressions in relatively free, unrestricted dialogue with physical features generated in a simulation, which facilitate analysis of dialogic linguistic phenomena regarding alignment in the formation of referring expressions known as conceptual pacts.",{"paper_id":3258,"title":3259,"year":81,"month":855,"day":63,"doi":3260,"resource_url":3261,"first_page":63,"last_page":63,"pdf_url":3262,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3263,"paper_type":860,"authors":3264,"abstract":3273},"lrec2018-main-124","On the Vector Representation of Utterances in Dialogue Context","10.63317\u002F4gnmycmg3hnk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-124","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F305.pdf","pragst-etal-2018-vector",[3265,3268,3271,3272],{"paper_id":3258,"author_seq":247,"given_name":3266,"surname":3267,"affiliation":63,"orcid":63},"Louisa","Pragst",{"paper_id":3258,"author_seq":232,"given_name":3269,"surname":3270,"affiliation":63,"orcid":63},"Niklas","Rach",{"paper_id":3258,"author_seq":218,"given_name":1251,"surname":1252,"affiliation":63,"orcid":63},{"paper_id":3258,"author_seq":203,"given_name":1471,"surname":3158,"affiliation":63,"orcid":63},"In recent years, the representation of words as vectors in a vector space, also known as word embeddings, has achieved a high degree of attention in the research community and the benefits of such a representation can be seen in the numerous applications that utilise it. In this work, we introduce dialogue vector models, a new language resource that represents dialogue utterances in vector space and captures the semantic meaning of those utterances in the dialogue context. We examine how the word vector approach can be applied to utterances in a dialogue to generate a meaningful representation of them in vector space. 
Utilising existing dialogue corpora and word vector models, we create dialogue vector models and show that they capture relevant semantic information by comparing them to manually annotated dialogue acts. Furthermore, we discuss potential areas of application for dialogue vector models, such as dialogue act annotation, learning of dialogue strategies, intent detection and paraphrasing.",{"paper_id":3275,"title":3276,"year":81,"month":855,"day":63,"doi":3277,"resource_url":3278,"first_page":63,"last_page":63,"pdf_url":3279,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3280,"paper_type":860,"authors":3281,"abstract":3290},"lrec2018-main-125","ES-Port: a Spontaneous Spoken Human-Human Technical Support Corpus for Dialogue Research in Spanish","10.63317\u002F4jcxiad655y4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-125","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F322.pdf","garcia-sardina-etal-2018-es",[3282,3284,3287],{"paper_id":3275,"author_seq":247,"given_name":1172,"surname":3283,"affiliation":63,"orcid":63},"García-Sardiña",{"paper_id":3275,"author_seq":232,"given_name":3285,"surname":3286,"affiliation":63,"orcid":63},"Manex","Serras",{"paper_id":3275,"author_seq":218,"given_name":3288,"surname":3289,"affiliation":63,"orcid":63},"Arantza","del Pozo","In this paper the ES-Port corpus is presented. ES-Port is a spontaneous spoken human-human dialogue corpus in Spanish that consists of 1170 dialogues from calls to the technical support department of a telecommunications provider. This paper describes its compilation process, from the transcription of the raw audio to the anonymisation of the sensitive data contained in the transcriptions. Because the anonymisation process was carried out through substitution by entities of the same type, coherence and readability are kept within the anonymised dialogues. 
In the resulting corpus, the replacements of the anonymised entities are labelled with their corresponding categories. In addition, the corpus is annotated with acoustic-related extralinguistic events such as background noise or laughter and linguistic phenomena such as false starts, use of filler words or code switching. The ES-Port corpus is now publicly available through the META-SHARE repository, with the main objective of promoting further research into more open domain data-driven dialogue systems in Spanish.",{"paper_id":3292,"title":3293,"year":81,"month":855,"day":63,"doi":3294,"resource_url":3295,"first_page":63,"last_page":63,"pdf_url":3296,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3297,"paper_type":860,"authors":3298,"abstract":3304},"lrec2018-main-126","From analysis to modeling of engagement as sequences of multimodal behaviors","10.63317\u002F2iastdvgzhwz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-126","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F456.pdf","dermouche-pelachaud-2018-analysis",[3299,3302],{"paper_id":3292,"author_seq":247,"given_name":3300,"surname":3301,"affiliation":63,"orcid":63},"Soumia","Dermouche",{"paper_id":3292,"author_seq":232,"given_name":1468,"surname":3303,"affiliation":63,"orcid":63},"Pelachaud","In this paper, we present an approach to endow an Embodied Conversational Agent with engagement capabilities. We relied on a corpus of expert-novice interactions. Two types of manual annotation were conducted: non-verbal signals such as gestures, head movements and smiles; engagement level of both expert and novice during the interaction. Then, we used a temporal sequence mining algorithm to extract non-verbal sequences eliciting variation of engagement perception. Our aim is to apply these findings in human-agent interaction  to analyze user's engagement level and to control agent's behavior. 
The novelty of this study is to consider explicitly engagement as a sequence of multimodal behaviors.",{"paper_id":3306,"title":3307,"year":81,"month":855,"day":63,"doi":3308,"resource_url":3309,"first_page":63,"last_page":63,"pdf_url":3310,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3311,"paper_type":860,"authors":3312,"abstract":3316},"lrec2018-main-127","A corpus of German political speeches from the 21st century","10.63317\u002F2uc8iqek8out","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-127","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F324.pdf","barbaresi-2018-corpus",[3313],{"paper_id":3306,"author_seq":247,"given_name":3314,"surname":3315,"affiliation":63,"orcid":63},"Adrien","Barbaresi","The present German political speeches corpus follows from an initial release which has been used in various research contexts. This article documents an updated and extended version: as 2017 marks the end of a legislative period, the corpus now includes the four highest ranked functions on federal state level. Besides providing a citable reference for this resource, the main contributions are (1) an extensive description of the corpus to be released and (2) the description of an interface to navigate through the texts, designed for researchers beyond the corpus and computational linguistics communities as well as for the general public. 
The corpus can be considered to be from the 21st century since most speeches have been written after 2001 and also because it includes a visualization interface providing synoptic overviews ordered chronologically, by speaker or by keyword as well as consequent accesses to the texts.",{"paper_id":3318,"title":3319,"year":81,"month":855,"day":63,"doi":3320,"resource_url":3321,"first_page":63,"last_page":63,"pdf_url":3322,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3323,"paper_type":860,"authors":3324,"abstract":3330},"lrec2018-main-128","Building Literary Corpora for Computational Literary Analysis - A Prototype to Bridge the Gap between CL and DH","10.63317\u002F5gwm3duxr23a","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-128","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F371.pdf","frank-ivanovic-2018-building",[3325,3327],{"paper_id":3318,"author_seq":247,"given_name":1731,"surname":3326,"affiliation":63,"orcid":63},"Frank",{"paper_id":3318,"author_seq":232,"given_name":3328,"surname":3329,"affiliation":63,"orcid":63},"Christine","Ivanovic","The design of LitText follows the traditional research approach in digital humanities (DH): collecting texts for critical reading and underlining parts of interest. Texts, in multiple languages, are prepared with a minimal markup language, and processed by NLP services. The result is converted to RDF (a.k.a. semantic-web, linked-data) triples. Additional data available as linked data on the web (e.g. Wikipedia data) can be added. The DH researcher can then harvest the corpus with SPARQL queries. 
The approach is demonstrated with the construction of a 20 million word corpus from English, German, Spanish, French and Italian texts and an example query to identify texts where animals behave like humans as it is the case in fables.",{"paper_id":3332,"title":3333,"year":81,"month":855,"day":63,"doi":3334,"resource_url":3335,"first_page":63,"last_page":63,"pdf_url":3336,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3337,"paper_type":860,"authors":3338,"abstract":3347},"lrec2018-main-129","Towards faithfully visualizing global linguistic diversity","10.63317\u002F2ss7hq737vrf","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-129","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F813.pdf","mcnew-etal-2018-towards",[3339,3342,3345],{"paper_id":3332,"author_seq":247,"given_name":3340,"surname":3341,"affiliation":63,"orcid":63},"Garland","McNew",{"paper_id":3332,"author_seq":232,"given_name":3343,"surname":3344,"affiliation":63,"orcid":63},"Curdin","Derungs",{"paper_id":3332,"author_seq":218,"given_name":1085,"surname":3346,"affiliation":63,"orcid":63},"Moran","The most popular strategy for visualizing worldwide linguistic diversity is to utilize point symbology by plotting linguistic features as colored dots or shapes on a Mercator map projection. This approach creates illusions due to the choice of cartographic projection and also from statistical biases inherent in samples of language data and  their encoding in typological databases. Here we describe these challenges and offer an approach towards faithfully visualizing linguistic diversity. Instead of Mercator, we propose an Eckert IV projection to serve as a map base layer. Instead of languages-as-points, we use Voronoi\u002FThiessen tessellations to model linguistic areas, including polygons for languages for which there is missing data in the sample under investigation. 
Lastly we discuss future work in the intersection of cartography and comparative linguistics, which must be addressed to further advance visualizations of worldwide linguistic diversity.",{"paper_id":3349,"title":3350,"year":81,"month":855,"day":63,"doi":3351,"resource_url":3352,"first_page":63,"last_page":63,"pdf_url":3353,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3354,"paper_type":860,"authors":3355,"abstract":3360},"lrec2018-main-130","The GermaParl Corpus of Parliamentary Protocols","10.63317\u002F5p8afoeiofoc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-130","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1024.pdf","blatte-blessing-2018-germaparl",[3356,3358],{"paper_id":3349,"author_seq":247,"given_name":3238,"surname":3357,"affiliation":63,"orcid":63},"Blätte",{"paper_id":3349,"author_seq":232,"given_name":1477,"surname":3359,"affiliation":63,"orcid":63},"Blessing","This paper introduces the GermaParl Corpus. We outline available data, the data preparation process for preparing corpora of parliamentary debates, and the tools we used to obtained hand-coded annotations that serve as training data for classifying debates. 
Beyond introducing a resource that is valuable for research, we share experiences and best practices for preparing corpora of plenary protocols.",{"paper_id":3362,"title":3363,"year":81,"month":855,"day":63,"doi":3364,"resource_url":3365,"first_page":63,"last_page":63,"pdf_url":3366,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3367,"paper_type":860,"authors":3368,"abstract":3387},"lrec2018-main-131","Identifying Speakers and Addressees in Dialogues Extracted from Literary Fiction","10.63317\u002F3fbftv65cwoy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-131","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1036.pdf","ek-etal-2018-identifying",[3369,3372,3375,3378,3381,3384],{"paper_id":3362,"author_seq":247,"given_name":3370,"surname":3371,"affiliation":63,"orcid":63},"Adam","Ek",{"paper_id":3362,"author_seq":232,"given_name":3373,"surname":3374,"affiliation":63,"orcid":63},"Mats","Wirén",{"paper_id":3362,"author_seq":218,"given_name":3376,"surname":3377,"affiliation":63,"orcid":63},"Robert","Östling",{"paper_id":3362,"author_seq":203,"given_name":3379,"surname":3380,"affiliation":63,"orcid":63},"Kristina","N. Björkenstam",{"paper_id":3362,"author_seq":188,"given_name":3382,"surname":3383,"affiliation":63,"orcid":63},"Gintarė","Grigonytė",{"paper_id":3362,"author_seq":172,"given_name":3385,"surname":3386,"affiliation":63,"orcid":63},"Sofia","Gustafson Capková","This paper describes an approach to identifying speakers and addressees in dialogues extracted from literary fiction, along with a dataset annotated for speaker and addressee. The overall purpose of this is to provide annotation of dialogue interaction between characters in literary corpora in order to allow for enriched search facilities and construction of social networks from the corpora. To predict speakers and addressees in a dialogue, we use a sequence labeling approach applied to a given set of characters. 
We use features relating to the current dialogue, the preceding narrative, and the complete preceding context. The results indicate that even with a small amount of training data, it is possible to build a fairly accurate classifier for speaker and addressee identification across different authors, though the identification of addressees is the more difficult task.",{"paper_id":3389,"title":3390,"year":81,"month":855,"day":63,"doi":3391,"resource_url":3392,"first_page":63,"last_page":63,"pdf_url":3393,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3394,"paper_type":860,"authors":3395,"abstract":3401},"lrec2018-main-132","Word Embedding Evaluation Datasets and Wikipedia Title Embedding for Chinese","10.63317\u002F2yv88wsv3hea","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-132","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F159.pdf","chen-ma-2018-word",[3396,3398],{"paper_id":3389,"author_seq":247,"given_name":3397,"surname":2401,"affiliation":63,"orcid":63},"Chi-Yen",{"paper_id":3389,"author_seq":232,"given_name":3399,"surname":3400,"affiliation":63,"orcid":63},"Wei-Yun","Ma","Distributed word representations are widely used in many NLP tasks, and  there are lots of benchmarks to evaluate word embeddings in English. However there are barely evaluation sets with large enough amount of data for Chinese word embeddings. Therefore, in this paper, we create several evaluation sets for Chinese word embedding on both word similarity task and analogical task via translating some existing popular evaluation sets from English to Chinese. To assess the quality of translated datasets, we obtain human rating from both experts and Amazon Mechanical Turk workers. While translating the datasets, we find out that around 30 percents of word pairs in the benchmarks are Wikipedia titles. This motivate us to evaluate the performance of Wikipedia title embeddings on our new benchmarks. 
Thus, in this paper, not only the new benchmarks are tested but some new improved approaches of Wikipedia title embeddings are proposed. We perform training of embeddings of Wikipedia titles using not only their Wikipedia context but also their Wikipedia categories, most of categories are noun phrases, and we identify the head words of the noun phrases by a parser for further emphasizing their roles on the training of title embeddings. Experimental results and the comprehensive error analysis demonstrate that the benchmarks can precisely reflect the approaches' quality, and the effectiveness of our improved approaches on Wikipedia title embeddings are also verified and analyzed in detail.",{"paper_id":3403,"title":3404,"year":81,"month":855,"day":63,"doi":3405,"resource_url":3406,"first_page":63,"last_page":63,"pdf_url":3407,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3408,"paper_type":860,"authors":3409,"abstract":3416},"lrec2018-main-133","An Automatic Learning of an Algerian Dialect Lexicon by using Multilingual Word Embeddings","10.63317\u002F3jk75ra4oo9c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-133","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F185.pdf","karima-smaili-2018-automatic",[3410,3413],{"paper_id":3403,"author_seq":247,"given_name":3411,"surname":3412,"affiliation":63,"orcid":63},"Abidi","Karima",{"paper_id":3403,"author_seq":232,"given_name":3414,"surname":3415,"affiliation":63,"orcid":63},"Kamel","Smaïli","The goal of this work consists in building automatically from a social network (Youtube) an Algerian dialect lexicon. Each entry of this lexicon is composed by a word, written in Arabic script (modern standard Arabic or dialect) or Latin script (Arabizi, French or English). To each word, several transliterations are proposed, written in a script different from the one used for the word itself. 
To do that, we harvested and aligned an Algerian dialect corpus by using an iterative method based on multlingual word embeddings representation. The multlinguality in the corpus is due to the fact that Algerian people use several languages to post comments in social networks: Modern Standard Arabic (MSA), Algerian dialect, French and sometimes English. In addition, the users of social networks write freely without any regard to the grammar of these languages. We tested the proposed method on a test lexicon, it leads to a score of 73% in terms of F-measure.",{"paper_id":3418,"title":3419,"year":81,"month":855,"day":63,"doi":3420,"resource_url":3421,"first_page":63,"last_page":63,"pdf_url":3422,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3423,"paper_type":860,"authors":3424,"abstract":3433},"lrec2018-main-134","Candidate Ranking for Maintenance of an Online Dictionary","10.63317\u002F2nvn3ihrgqho","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-134","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F222.pdf","broad-etal-2018-candidate",[3425,3427,3430],{"paper_id":3418,"author_seq":247,"given_name":1217,"surname":3426,"affiliation":63,"orcid":63},"Broad",{"paper_id":3418,"author_seq":232,"given_name":3428,"surname":3429,"affiliation":63,"orcid":63},"Helen","Langone",{"paper_id":3418,"author_seq":218,"given_name":3431,"surname":3432,"affiliation":63,"orcid":63},"David Guy","Brizan","Traditionally, the process whereby a lexicographer identifies a lexical item to add to a dictionary -- a database of lexical items -- has been time-consuming and subjective. In the modern age of online dictionaries, all queries for lexical entries not currently in the database are indistinguishable from a larger list of misspellings, meaning that potential new or trending entries can get lost easily. 
In this project, we develop a system that uses machine learning techniques to assign these ``misspells'' a probability of being a novel or missing entry, incorporating signals from orthography, usage by trusted online sources, and dictionary query patterns.",{"paper_id":3435,"title":3436,"year":81,"month":855,"day":63,"doi":3437,"resource_url":3438,"first_page":63,"last_page":63,"pdf_url":3439,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3440,"paper_type":860,"authors":3441,"abstract":3445},"lrec2018-main-135","Language adaptation experiments via cross-lingual embeddings for related languages","10.63317\u002F2jv83scsdjqj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-135","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F227.pdf","sharoff-2018-language",[3442],{"paper_id":3435,"author_seq":247,"given_name":3443,"surname":3444,"affiliation":63,"orcid":63},"Serge","Sharoff","Language Adaptation (similarly to Domain Adaptation) is a general approach to extend existing resources from a better resourced language (donor) to a lesser resourced one (recipient) by exploiting the lexical and grammatical similarity between them when the two languages are related. The current study improves the state of the art in cross-lingual word embeddings by considering the impact of orthographic similarity between cognates.  In particular, the use of the Weighted Levenshtein Distance combined with orthogonalisation of the translation matrix and generalised correction for hubness can considerably improve the state of the art in induction of bilingual lexicons.  In addition to intrinsic evaluation in the bilingual lexicon induction task, the paper reports extrinsic evaluation of the cross-lingual embeddings via their application to the Named-Entity Recognition task across Slavonic languages. 
The tools and the aligned word embedding spaces for the Romance and Slavonic language families have been released.",{"paper_id":3447,"title":3448,"year":81,"month":855,"day":63,"doi":3449,"resource_url":3450,"first_page":63,"last_page":63,"pdf_url":3451,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3452,"paper_type":860,"authors":3453,"abstract":3463},"lrec2018-main-136","Tools for Building an Interlinked Synonym Lexicon Network","10.63317\u002F4b84hhhpu3zn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-136","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F232.pdf","uresova-etal-2018-tools",[3454,3457,3459,3461],{"paper_id":3447,"author_seq":247,"given_name":3455,"surname":3456,"affiliation":63,"orcid":63},"Zdeňka","Urešová",{"paper_id":3447,"author_seq":232,"given_name":890,"surname":3458,"affiliation":63,"orcid":63},"Fučíková",{"paper_id":3447,"author_seq":218,"given_name":890,"surname":3460,"affiliation":63,"orcid":63},"Hajičová",{"paper_id":3447,"author_seq":203,"given_name":2633,"surname":3462,"affiliation":63,"orcid":63},"Hajič","This paper presents the structure, features and design of a new interlinked verbal synonym lexical resource called CzEngClass and the editor tool being developed to assist the work. This lexicon captures cross-lingual (Czech and English) synonyms, using valency behavior of synonymous verbs in relation to semantic roles as one of the criteria for defining such interlingual synonymy. The tool, called Synonym Class Editor - SynEd, is a user-friendly tool specifically customized to build and edit individual entries in the lexicon. It helps to keep the cross-lingual synonym classes consistent and linked to internal as well as to well-known external lexical resources. The structure of SynEd also allows to keep and edit the appropriate syntactic and semantic information for each Synonym Class member. 
The editor makes it possible to display examples of class members’ usage in translational context in a parallel corpus. SynEd is platform independent and may be used for multiple languages. SynEd, CzEngClass and services based on them will be openly available.",{"paper_id":3465,"title":3466,"year":81,"month":855,"day":63,"doi":3467,"resource_url":3468,"first_page":63,"last_page":63,"pdf_url":3469,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3470,"paper_type":860,"authors":3471,"abstract":3475},"lrec2018-main-137","Very Large-Scale Lexical Resources to Enhance Chinese and Japanese Machine Translation","10.63317\u002F3mkym547d7ba","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-137","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F246.pdf","halpern-2018-large",[3472],{"paper_id":3465,"author_seq":247,"given_name":3473,"surname":3474,"affiliation":63,"orcid":63},"Jack","Halpern","A major issue in machine translation (MT) applications is the recognition and translation of named entities. This is especially true for Chinese and Japanese, whose scripts present linguistic and algorithmic challenges not found in other languages. This paper discusses some of the major issues in Japanese and Chinese MT, such as the difficulties of translating proper nouns and technical terms, and the complexities of orthographic variation in Japanese. Of special interest are neural machine translation (NMT) systems, which suffer from a serious out-of-vocabulary problem. However, the current architecture of these systems makes it technically challenging for them to alleviate this problem by supporting lexicons. 
This paper introduces some Very Large-Scale Lexical Resources (VLSLR) consisting of millions of named entities, and argues that the quality of MT in general, and NMT systems in particular, can be significantly enhanced through the integration of lexicons.",{"paper_id":3477,"title":3478,"year":81,"month":855,"day":63,"doi":3479,"resource_url":3480,"first_page":63,"last_page":63,"pdf_url":3481,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3482,"paper_type":860,"authors":3483,"abstract":3492},"lrec2018-main-138","Combining Concepts and Their Translations from Structured Dictionaries of Uralic Minority Languages","10.63317\u002F3fsor5qcrvnx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-138","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F364.pdf","hamalainen-etal-2018-combining",[3484,3487,3490],{"paper_id":3477,"author_seq":247,"given_name":3485,"surname":3486,"affiliation":63,"orcid":63},"Mika","Hämäläinen",{"paper_id":3477,"author_seq":232,"given_name":3488,"surname":3489,"affiliation":63,"orcid":63},"Liisa Lotta","Tarvainen",{"paper_id":3477,"author_seq":218,"given_name":3473,"surname":3491,"affiliation":63,"orcid":63},"Rueter","Building language resources for endangered languages, especially in the case of dictionaries, requires a substantial amount of manual work. This, however, is a time-consuming undertaking, and it is also why we propose an automated method for expanding the knowledge in the existing dictionaries. In this paper, we present an approach to automatically combine conceptually divided translations from multilingual dictionaries for small Uralic languages. This is done for the noun dictionaries of Skolt Sami, Erzya, Moksha and Komi- Zyrian in such a way that the combined translations are included in the dictionaries of each language and then evaluated by professional linguists fluent in these languages. 
Inclusion of the method as a part of the new crowdsourced MediaWiki based pipeline for editing the dictionaries is discussed. The method can be used there not only to expand the existing dictionaries but also to provide the editors with translations when they are adding a new lexical entry to the system.",{"paper_id":3494,"title":3495,"year":81,"month":855,"day":63,"doi":3496,"resource_url":3497,"first_page":63,"last_page":63,"pdf_url":3498,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3499,"paper_type":860,"authors":3500,"abstract":3512},"lrec2018-main-139","Transfer of Frames from English FrameNet to Construct Chinese FrameNet: A Bilingual Corpus-Based Approach","10.63317\u002F58vv9pbm8ze4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-139","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F377.pdf","yang-etal-2018-transfer",[3501,3504,3507,3510],{"paper_id":3494,"author_seq":247,"given_name":3502,"surname":3503,"affiliation":63,"orcid":63},"Tsung-Han","Yang",{"paper_id":3494,"author_seq":232,"given_name":3505,"surname":3506,"affiliation":63,"orcid":63},"Hen-Hsen","Huang",{"paper_id":3494,"author_seq":218,"given_name":3508,"surname":3509,"affiliation":63,"orcid":63},"An-Zi","Yen",{"paper_id":3494,"author_seq":203,"given_name":3511,"surname":2401,"affiliation":63,"orcid":63},"Hsin-Hsi","Current publicly available Chinese FrameNet has a relatively low coverage of frames and lexical units compared with FrameNet in other languages. Frames are incompletely specified for some lexical units, and some critical lexical elements are even missing. That results in suitable frames cannot be triggered and filled in practical applications. This paper presents an automatic approach to constructing Chinese FrameNet. We first capture the mapping between English lexical entries and their Chinese counterparts in a large scale sentence-aligned English-Chinese bilingual corpus. 
Then, a semantic transfer approach is proposed based on word alignments applied to a large balanced bilingual corpus. The resource currently covers 779 frames and 36k lexical units.",{"paper_id":3514,"title":3515,"year":81,"month":855,"day":63,"doi":3516,"resource_url":3517,"first_page":63,"last_page":63,"pdf_url":3518,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3519,"paper_type":860,"authors":3520,"abstract":3526},"lrec2018-main-140","EFLLex: A Graded Lexical Resource for Learners of English as a Foreign Language","10.63317\u002F43e9y9se8gyj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-140","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F439.pdf","durlich-francois-2018-efllex",[3521,3524],{"paper_id":3514,"author_seq":247,"given_name":3522,"surname":3523,"affiliation":63,"orcid":63},"Luise","Dürlich",{"paper_id":3514,"author_seq":232,"given_name":1615,"surname":3525,"affiliation":63,"orcid":63},"François","This paper introduces EFLLex, an innovative lexical resource that describes the use of 15,280 English words in pedagogical materials across the proficiency levels of the European Framework of Reference for Languages. The methodology adopted to produce the resource implies the selection of an efficient part-of-speech tagger, the use of a robust estimator for frequency computation and some manual post-editing work. 
The content of the resource is described and compared to other vocabulary lists (MRC and BNC) and to a reference pedagogical resource: the English Vocabulary Profile.",{"paper_id":3528,"title":3529,"year":81,"month":855,"day":63,"doi":3530,"resource_url":3531,"first_page":63,"last_page":63,"pdf_url":3532,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3533,"paper_type":860,"authors":3534,"abstract":3547},"lrec2018-main-141","English-Basque Statistical and Neural Machine Translation","10.63317\u002F2qaifpjgg3vf","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-141","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F101.pdf","jauregi-unanue-etal-2018-english",[3535,3538,3541,3544],{"paper_id":3528,"author_seq":247,"given_name":3536,"surname":3537,"affiliation":63,"orcid":63},"Inigo","Jauregi Unanue",{"paper_id":3528,"author_seq":232,"given_name":3539,"surname":3540,"affiliation":63,"orcid":63},"Lierni","Garmendia Arratibel",{"paper_id":3528,"author_seq":218,"given_name":3542,"surname":3543,"affiliation":63,"orcid":63},"Ehsan","Zare Borzeshi",{"paper_id":3528,"author_seq":203,"given_name":3545,"surname":3546,"affiliation":63,"orcid":63},"Massimo","Piccardi","Neural Machine Translation (NMT) has attracted increasing attention in the recent years. However, it tends to require very large training corpora which could prove problematic for languages with low resources. For this reason, Statistical Machine Translation (SMT) continues to be a popular approach for low-resource language pairs. In this work, we address English-Basque translation and compare the performance of three contemporary statistical and neural machine translation systems: OpenNMT, Moses SMT and Google Translate. For evaluation, we employ an open-domain and an IT-domain corpora from the WMT16 resources for machine translation. 
In addition, we release a small dataset (Berriak) of 500 highly-accurate English-Basque translations of complex sentences useful for a thorough testing of the translation systems.",{"paper_id":3549,"title":3550,"year":81,"month":855,"day":63,"doi":3551,"resource_url":3552,"first_page":63,"last_page":63,"pdf_url":3553,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3554,"paper_type":860,"authors":3555,"abstract":3568},"lrec2018-main-142","TQ-AutoTest – An Automated Test Suite for (Machine) Translation Quality","10.63317\u002F2mak5p4vdhbt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-142","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F121.pdf","macketanz-etal-2018-tq",[3556,3559,3562,3565],{"paper_id":3549,"author_seq":247,"given_name":3557,"surname":3558,"affiliation":63,"orcid":63},"Vivien","Macketanz",{"paper_id":3549,"author_seq":232,"given_name":3560,"surname":3561,"affiliation":63,"orcid":63},"Renlong","Ai",{"paper_id":3549,"author_seq":218,"given_name":3563,"surname":3564,"affiliation":63,"orcid":63},"Aljoscha","Burchardt",{"paper_id":3549,"author_seq":203,"given_name":3566,"surname":3567,"affiliation":63,"orcid":63},"Hans","Uszkoreit","In several areas of NLP evaluation, test suites have been used to analyze the strengths and weaknesses of systems. Today, Machine Translation (MT) quality is usually assessed by shallow automatic comparisons of MT outputs with reference corpora resulting in a number. Especially the trend towards neural MT has renewed peoples’ interest in better and more analytical diagnostic methods for MT quality. In this paper we present TQ-AutoTest, a novel framework that supports a linguistic evaluation of (machine) translations using test suites. Our current test suites comprise about 5000 handcrafted test items for the language pair German–English. 
The framework supports the creation of tests and the semi-automatic evaluation of the MT results using regular expressions. The expressions help to classify the results as correct, incorrect or as requiring a manual check. The approach can easily be extended to other NLP tasks where test suites can be used such as evaluating (one-shot) dialogue systems.",{"paper_id":3570,"title":3571,"year":81,"month":855,"day":63,"doi":3572,"resource_url":3573,"first_page":63,"last_page":63,"pdf_url":3574,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3575,"paper_type":860,"authors":3576,"abstract":3584},"lrec2018-main-143","Exploiting Pre-Ordering for Neural Machine Translation","10.63317\u002F2eq95zrxctxs","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-143","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F129.pdf","zhao-etal-2018-exploiting",[3577,3579,3581],{"paper_id":3570,"author_seq":247,"given_name":3503,"surname":3578,"affiliation":63,"orcid":63},"Zhao",{"paper_id":3570,"author_seq":232,"given_name":3580,"surname":2381,"affiliation":63,"orcid":63},"Jiajun",{"paper_id":3570,"author_seq":218,"given_name":3582,"surname":3583,"affiliation":63,"orcid":63},"Chengqing","Zong","Neural Machine Translation (NMT) has drawn much attention due to its promising translation performance in recent years. However, the under-translation and over-translation problem still remain a big challenge. Through error analysis, we find that under-translation is much more prevalent than over-translation and the source words that need to be reordered during translation are more likely to be ignored. To address the under-translation problem, we explore the pre-ordering approach for NMT. Specifically, we pre-order the source sentences to approximate the target language word order. We then combine the pre-ordering model with position embedding to enhance the monotone translation. 
Finally, we augment our model with the coverage mechanism to tackle the over-translation problem. Experimental results on Chinese-to-English translation have shown that our method can significantly improve the translation quality by up to 2.43 BLEU points. Furthermore, the detailed analysis demonstrates that our approach can substantially reduce the number of under-translation cases by 30.4% (compared to 17.4% using the coverage model).",{"paper_id":3586,"title":3587,"year":81,"month":855,"day":63,"doi":3588,"resource_url":3589,"first_page":63,"last_page":63,"pdf_url":3590,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3591,"paper_type":860,"authors":3592,"abstract":3600},"lrec2018-main-144","Improving a Multi-Source Neural Machine Translation Model with Corpus Extension for Low-Resource Languages","10.63317\u002F4oacxv7nncx9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-144","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F139.pdf","choi-etal-2018-improving",[3593,3595,3598],{"paper_id":3586,"author_seq":247,"given_name":3594,"surname":1110,"affiliation":63,"orcid":63},"Gyu-Hyeon",{"paper_id":3586,"author_seq":232,"given_name":3596,"surname":3597,"affiliation":63,"orcid":63},"Jong-Hun","Shin",{"paper_id":3586,"author_seq":218,"given_name":3599,"surname":1104,"affiliation":63,"orcid":63},"Young-Kil","In machine translation, we often try to collect resources to improve performance. However, most of the language pairs, such as Korean-Arabic and Korean-Vietnamese, do not have enough resources to train machine translation systems. In this paper, we propose the use of synthetic methods for extending a low-resource corpus and apply it to a multi-source neural machine translation model. We showed the improvement of machine translation performance through corpus extension using the synthetic method. 
We specifically focused on how to create source sentences that can make better target sentences, including the use of synthetic methods. We found that the corpus extension could also improve the performance of multi-source neural machine translation. We showed the corpus extension and multi-source model to be efficient methods for a low-resource language pair. Furthermore, when both methods were used together, we found better machine translation performance.",{"paper_id":3602,"title":3603,"year":81,"month":855,"day":63,"doi":3604,"resource_url":3605,"first_page":63,"last_page":63,"pdf_url":3606,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3607,"paper_type":860,"authors":3608,"abstract":3620},"lrec2018-main-145","Dynamic Oracle for Neural Machine Translation in Decoding Phase","10.63317\u002F4wg33qmmhczy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-145","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F163.pdf","dou-etal-2018-dynamic",[3609,3612,3613,3615,3618],{"paper_id":3602,"author_seq":247,"given_name":3610,"surname":3611,"affiliation":63,"orcid":63},"Zi-Yi","Dou",{"paper_id":3602,"author_seq":232,"given_name":2717,"surname":2438,"affiliation":63,"orcid":63},{"paper_id":3602,"author_seq":218,"given_name":3614,"surname":3506,"affiliation":63,"orcid":63},"Shu-Jian",{"paper_id":3602,"author_seq":203,"given_name":3616,"surname":3617,"affiliation":63,"orcid":63},"Xin-Yu","Dai",{"paper_id":3602,"author_seq":188,"given_name":3619,"surname":2401,"affiliation":63,"orcid":63},"Jia-Jun","The past several years have witnessed the rapid progress of end-to-end Neural Machine Translation (NMT). However, there exists discrepancy between training and inference in NMT when decoding, which may lead to serious problems since the model might be in a part of the state space it has never seen during training. To address the issue, Scheduled Sampling has been proposed. 
However, there are certain limitations in Scheduled Sampling and we propose two dynamic oracle-based methods to improve it. We manage to mitigate the discrepancy by changing the training process towards a less guided scheme and meanwhile aggregating the oracle's demonstrations. Experimental results show that the proposed approaches improve translation quality over standard NMT system.",{"paper_id":3622,"title":3623,"year":81,"month":855,"day":63,"doi":3624,"resource_url":3625,"first_page":63,"last_page":63,"pdf_url":3626,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3627,"paper_type":860,"authors":3628,"abstract":3633},"lrec2018-main-146","One Sentence One Model for Neural Machine Translation","10.63317\u002F5ffogmscmeqx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-146","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F195.pdf","li-etal-2018-one",[3629,3631,3632],{"paper_id":3622,"author_seq":247,"given_name":3630,"surname":1591,"affiliation":63,"orcid":63},"Xiaoqing",{"paper_id":3622,"author_seq":232,"given_name":3580,"surname":2381,"affiliation":63,"orcid":63},{"paper_id":3622,"author_seq":218,"given_name":3582,"surname":3583,"affiliation":63,"orcid":63},"Neural machine translation (NMT) becomes a new state of the art and achieves promising translation performance using a simple encoder-decoder neural network.  This neural network is trained once on the parallel corpus and the fixed network is used to translate all the test sentences.  We argue that the general fixed network parameters cannot best fit each specific testing sentences. In this paper, we propose the dynamic NMT which learns a general network as usual, and then fine-tunes the network for each test sentence.  The fine-tune work is done on a small set of the bilingual training data that is obtained through similarity search according to the test sentence.  
Extensive experiments demonstrate that this method can significantly improve the translation performance, especially when highly similar sentences are available.",{"paper_id":3635,"title":3636,"year":81,"month":855,"day":63,"doi":3637,"resource_url":3638,"first_page":63,"last_page":63,"pdf_url":3639,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3640,"paper_type":860,"authors":3641,"abstract":3653},"lrec2018-main-147","A Parallel Corpus of Arabic-Japanese News Articles","10.63317\u002F4ekd22eqevn6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-147","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F400.pdf","inoue-etal-2018-parallel",[3642,3645,3648,3650],{"paper_id":3635,"author_seq":247,"given_name":3643,"surname":3644,"affiliation":63,"orcid":63},"Go","Inoue",{"paper_id":3635,"author_seq":232,"given_name":3646,"surname":3647,"affiliation":63,"orcid":63},"Nizar","Habash",{"paper_id":3635,"author_seq":218,"given_name":3649,"surname":2464,"affiliation":63,"orcid":63},"Yuji",{"paper_id":3635,"author_seq":203,"given_name":3651,"surname":3652,"affiliation":63,"orcid":63},"Hiroyuki","Aoyama","Much work has been done on machine translation between major language pairs including Arabic-English and English-Japanese thanks to the availability of large-scale parallel corpora with manually verified subsets of parallel sentences. However, there has been little research conducted on the Arabic-Japanese language pair due to its parallel-data scarcity, despite being a good example of interestingly contrasting differences in typology. In this paper, we describe the creation process and statistics of the Arabic-Japanese portion of the TUFS Media Corpus, a parallel corpus of translated news articles collected at Tokyo University of Foreign Studies (TUFS). Part of the corpus is manually aligned at the sentence level for development and testing. 
The corpus is provided in two formats: A document-level parallel corpus in XML format, and a sentence-level parallel corpus in plain text format. We also report the first results of Arabic-Japanese phrase-based machine translation trained on our corpus.",{"paper_id":3655,"title":3656,"year":81,"month":855,"day":63,"doi":3657,"resource_url":3658,"first_page":63,"last_page":63,"pdf_url":3659,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3660,"paper_type":860,"authors":3661,"abstract":3671},"lrec2018-main-148","Examining the Tip of the Iceberg: A Data Set for Idiom Translation","10.63317\u002F2yehgypja4bs","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-148","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F432.pdf","fadaee-etal-2018-examining",[3662,3665,3668],{"paper_id":3655,"author_seq":247,"given_name":3663,"surname":3664,"affiliation":63,"orcid":63},"Marzieh","Fadaee",{"paper_id":3655,"author_seq":232,"given_name":3666,"surname":3667,"affiliation":63,"orcid":63},"Arianna","Bisazza",{"paper_id":3655,"author_seq":218,"given_name":3669,"surname":3670,"affiliation":63,"orcid":63},"Christof","Monz","Neural Machine Translation (NMT) has been widely used in recent years with significant improvements for many language pairs. Although state-of-the-art NMT systems are generating progressively better translations, idiom translation remains one of the open challenges in this field. Idioms, a category of multiword expressions, are an interesting language phenomenon where the overall meaning of the expression cannot be composed from the meanings of its parts. A first important challenge is the lack of dedicated data sets for learning and evaluating idiom translation. In this paper we address this problem by creating the first large-scale data set for idiom translation. 
Our data set is automatically extracted from a widely used German↔English translation corpus and includes, for each language direction, a targeted evaluation set where all sentences contain idioms and a regular training corpus where sentences including idioms are marked. We release this data set and use it to perform preliminary NMT experiments as the first step towards better idiom translation.",{"paper_id":3673,"title":3674,"year":81,"month":855,"day":63,"doi":3675,"resource_url":3676,"first_page":63,"last_page":63,"pdf_url":3677,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3678,"paper_type":860,"authors":3679,"abstract":3692},"lrec2018-main-149","Automatic Enrichment of Terminological Resources: the IATE RDF Example","10.63317\u002F4pm5rtxujcff","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-149","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F541.pdf","arcan-etal-2018-automatic",[3680,3683,3686,3689],{"paper_id":3673,"author_seq":247,"given_name":3681,"surname":3682,"affiliation":63,"orcid":63},"Mihael","Arcan",{"paper_id":3673,"author_seq":232,"given_name":3684,"surname":3685,"affiliation":63,"orcid":63},"Elena","Montiel-Ponsoda",{"paper_id":3673,"author_seq":218,"given_name":3687,"surname":3688,"affiliation":63,"orcid":63},"John P.","McCrae",{"paper_id":3673,"author_seq":203,"given_name":3690,"surname":3691,"affiliation":63,"orcid":63},"Paul","Buitelaar","Terminological resources have proven necessary in many organizations and institutions to ensure communication between experts. However, the maintenance of these resources is a very time-consuming and expensive process. Therefore, the work described in this contribution aims to automate the maintenance process of such resources. 
As an example, we demonstrate enriching the RDF version of IATE with new terms in the languages for which no translation was available, as well as with domain-disambiguated sentences and information about usage frequency. This is achieved by relying on machine translation trained on parallel corpora that contains the terms in question and multilingual word sense disambiguation performed on the context provided by the sentences. Our results show that for most languages translating the terms within a disambiguated context significantly outperforms the approach with randomly selected sentences.",{"paper_id":3694,"title":3695,"year":81,"month":855,"day":63,"doi":3696,"resource_url":3697,"first_page":63,"last_page":63,"pdf_url":3698,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3699,"paper_type":860,"authors":3700,"abstract":3706},"lrec2018-main-150","A Comparative Study of Extremely Low-Resource Transliteration of the World’s Languages","10.63317\u002F3stwn8v7fk7k","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-150","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F774.pdf","wu-yarowsky-2018-comparative",[3701,3704],{"paper_id":3694,"author_seq":247,"given_name":3702,"surname":3703,"affiliation":63,"orcid":63},"Winston","Wu",{"paper_id":3694,"author_seq":232,"given_name":1199,"surname":3705,"affiliation":63,"orcid":63},"Yarowsky","Transliteration from low-resource languages is difficult, in large part due to the small amounts of data available for training transliteration systems. In this paper, we evaluate the effectiveness of several translation methods in the task of transliterating around 1000 Bible names from 591 languages into English. In this extremely low-resource task, we found that a phrase-based MT system performs much better than other methods, including a g2p system and a neural MT system. 
However, by combining the data and training a single neural system, we discovered significant gains over single-language systems. We release the output from each system for comparative analysis.",{"paper_id":3708,"title":3709,"year":81,"month":855,"day":63,"doi":3710,"resource_url":3711,"first_page":63,"last_page":63,"pdf_url":3712,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3713,"paper_type":860,"authors":3714,"abstract":3724},"lrec2018-main-151","Translating Web Search Queries into Natural Language Questions","10.63317\u002F27kj83hk39ot","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-151","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F805.pdf","kumar-etal-2018-translating",[3715,3718,3721],{"paper_id":3708,"author_seq":247,"given_name":3716,"surname":3717,"affiliation":63,"orcid":63},"Adarsh","Kumar",{"paper_id":3708,"author_seq":232,"given_name":3719,"surname":3720,"affiliation":63,"orcid":63},"Sandipan","Dandapat",{"paper_id":3708,"author_seq":218,"given_name":3722,"surname":3723,"affiliation":63,"orcid":63},"Sushil","Chordia","Users often query a search engine with a specific question in mind and often these queries are keywords or sub-sentential fragments. In this paper, we are proposing a method to generate well-formed natural language question from a given keyword-based query, which has the same question intent as the query. Conversion of keyword based web query into a well formed question has lots of applications in search engines, Community Question Answering (CQA) website and bots communication. We found a synergy between query-to-question problem with standard machine translation (MT) task. We have used both Statistical MT (SMT) and Neural MT (NMT) models to generate the questions from query. 
We have observed that MT models perform well in terms of both automatic and human evaluation.",{"paper_id":3726,"title":3727,"year":81,"month":855,"day":63,"doi":3728,"resource_url":3729,"first_page":63,"last_page":63,"pdf_url":3730,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3731,"paper_type":860,"authors":3732,"abstract":3739},"lrec2018-main-152","Construction of a Japanese Word Similarity Dataset","10.63317\u002F26te6gx7k2h4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-152","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F96.pdf","sakaizawa-komachi-2018-construction",[3733,3736],{"paper_id":3726,"author_seq":247,"given_name":3734,"surname":3735,"affiliation":63,"orcid":63},"Yuya","Sakaizawa",{"paper_id":3726,"author_seq":232,"given_name":3737,"surname":3738,"affiliation":63,"orcid":63},"Mamoru","Komachi","An evaluation of distributed word representation is generally conducted using a word similarity task and\u002For a word analogy task. There are many datasets readily available for these tasks in English. However, evaluating distributed representation in languages that do not have such resources (e.g., Japanese) is difficult. Therefore, as a first step toward evaluating distributed representations in Japanese, we constructed a Japanese word similarity dataset. To the best of our knowledge, our dataset is the first resource that can be used to evaluate distributed representations in Japanese. 
Moreover, our dataset contains various parts of speech and includes rare words in addition to common words.",{"paper_id":3741,"title":3742,"year":81,"month":855,"day":63,"doi":3743,"resource_url":3744,"first_page":63,"last_page":63,"pdf_url":3745,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3746,"paper_type":860,"authors":3747,"abstract":3759},"lrec2018-main-153","Acquiring Verb Classes Through Bottom-Up Semantic Verb Clustering","10.63317\u002F4qjhfpzuxeam","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-153","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F116.pdf","majewska-etal-2018-acquiring",[3748,3751,3754,3757],{"paper_id":3741,"author_seq":247,"given_name":3749,"surname":3750,"affiliation":63,"orcid":63},"Olga","Majewska",{"paper_id":3741,"author_seq":232,"given_name":3752,"surname":3753,"affiliation":63,"orcid":63},"Diana","McCarthy",{"paper_id":3741,"author_seq":218,"given_name":3755,"surname":3756,"affiliation":63,"orcid":63},"Ivan","Vulić",{"paper_id":3741,"author_seq":203,"given_name":884,"surname":3758,"affiliation":63,"orcid":63},"Korhonen","In this paper, we present the first analysis of bottom-up manual semantic clustering of verbs in three languages, English, Polish and Croatian. Verb classes including syntactic and semantic information have been shown to support many NLP tasks by allowing abstraction from individual words and thereby alleviating data sparseness. The availability of such classifications is however still non-existent or limited in most languages. While a range of automatic verb classification approaches have been proposed, high-quality resources and gold standards are needed for evaluation and to improve the performance of NLP systems. We investigate whether semantic verb classes in three different languages can be reliably obtained from native speakers without linguistics training. 
The analysis of inter-annotator agreement shows an encouraging degree of overlap in the classifications produced for each language individually, as well as across all three languages. Comparative examination of the resultant classifications provides interesting insights into cross-linguistic semantic commonalities and patterns of ambiguity.",{"paper_id":3761,"title":3762,"year":81,"month":855,"day":63,"doi":3763,"resource_url":3764,"first_page":63,"last_page":63,"pdf_url":3765,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3766,"paper_type":860,"authors":3767,"abstract":3778},"lrec2018-main-154","Constructing High Quality Sense-specific Corpus and Word Embedding via Unsupervised Elimination of Pseudo Multi-sense","10.63317\u002F2rfuebsvm4hx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-154","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F118.pdf","shi-etal-2018-constructing",[3768,3771,3773,3775],{"paper_id":3761,"author_seq":247,"given_name":3769,"surname":3770,"affiliation":63,"orcid":63},"Haoyue","Shi",{"paper_id":3761,"author_seq":232,"given_name":3772,"surname":1588,"affiliation":63,"orcid":63},"Xihao",{"paper_id":3761,"author_seq":218,"given_name":1703,"surname":3774,"affiliation":63,"orcid":63},"Sun",{"paper_id":3761,"author_seq":203,"given_name":3776,"surname":3777,"affiliation":63,"orcid":63},"Junfeng","Hu","Multi-sense word embedding is an important extension of neural word embeddings. By leveraging context of each word instance, multi-prototype version of word embeddings were accomplished to represent the multi-senses.  Unfortunately, this kind of context based approach inevitably produces multiple senses which should actually be a single one, suffering from the various context of a word. Shi et al.(2016) used WordNet to evaluate the neighborhood similarity of each sense pair to detect such pseudo multi-senses. 
In this paper, a novel framework for unsupervised corpus sense tagging is presented, which mainly contains four steps: (a) train multi-sense word embeddings on the given corpus, using existing multi-sense word embedding frameworks; (b) detect pseudo multi-senses in the obtained embeddings, without requirement to any extra language resources; (c) label each word in the corpus with a specific sense tag, with respect to the result of pseudo multi-sense detection; (d) re-train multi-sense word embeddings with the pre-selected sense tags. We evaluate our framework by training word embeddings with the obtained sense specific corpus. On the tasks of word similarity, word analogy as well as sentence understanding, the embeddings trained on sense-specific corpus obtain better results than the basic strategy which is applied in step (a).",{"paper_id":3780,"title":3781,"year":81,"month":855,"day":63,"doi":3782,"resource_url":3783,"first_page":63,"last_page":63,"pdf_url":3784,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3785,"paper_type":860,"authors":3786,"abstract":3789},"lrec2018-main-155","Urdu Word Embeddings","10.63317\u002F5dz3iz2hbqcy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-155","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F148.pdf","haider-2018-urdu",[3787],{"paper_id":3780,"author_seq":247,"given_name":3788,"surname":3199,"affiliation":63,"orcid":63},"Samar","Representing words as vectors which encode their semantic properties is an important component in natural language processing. Recent advances in distributional semantics have led to the rise of neural network-based models that use unsupervised learning to represent words as dense, distributed vectors, called `word embeddings'. 
These embeddings have led to breakthroughs in performance in multiple natural language processing applications, and also hold the key to improving natural language processing for low-resource languages by helping machine learning algorithms learn patterns more easily from these richer representations of words, thereby allowing better generalization from less data. In this paper, we train the skip-gram model on more than 140 million Urdu words to create the first large-scale word embeddings for the Urdu language. We analyze the quality of the learned embeddings by looking at the closest neighbours to different words in the vector space and find that they capture a high degree of syntactic and semantic similarity between words. We evaluate this quantitatively by experimenting with different vector dimensionalities and context window sizes and measuring their performance on Urdu translations of standard word similarity tasks. The embeddings are made freely available in order to advance research on Urdu language processing.",{"paper_id":3791,"title":3792,"year":81,"month":855,"day":63,"doi":3793,"resource_url":3794,"first_page":63,"last_page":63,"pdf_url":3795,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3796,"paper_type":860,"authors":3797,"abstract":3805},"lrec2018-main-156","Social Image Tags as a Source of Word Embeddings: A Task-oriented Evaluation","10.63317\u002F3peezg3yshic","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-156","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F247.pdf","hasegawa-etal-2018-social",[3798,3800,3803],{"paper_id":3791,"author_seq":247,"given_name":3485,"surname":3799,"affiliation":63,"orcid":63},"Hasegawa",{"paper_id":3791,"author_seq":232,"given_name":3801,"surname":3802,"affiliation":63,"orcid":63},"Tetsunori","Kobayashi",{"paper_id":3791,"author_seq":218,"given_name":2913,"surname":3804,"affiliation":63,"orcid":63},"Hayashi","Distributional hypothesis has been 
playing a central role in statistical NLP. Recently, however, its limitation in incorporating perceptual and empirical knowledge is noted, eliciting a field of perceptually grounded computational semantics. Typical sources of features in such a research are image datasets, where images are accompanied by linguistic tags and\u002For descriptions. Mainstream approaches employ machine learning techniques to integrate\u002Fcombine visual features with linguistic features. In contrast to or supplementing these approaches, this study assesses the effectiveness of social image tags in generating word embeddings, and argues that these generated representations exhibit somewhat different and favorable behaviors from corpus-originated representations. More specifically, we generated word embeddings by using image tags obtained from a large social image dataset YFCC100M, which collects Flickr images and the associated tags. We evaluated the efficacy of generated word embeddings with standard semantic similarity\u002Frelatedness tasks, which showed that comparable performances with corpus-originated word embeddings were attained. These results further suggest that the generated embeddings could be effective in discriminating synonyms and antonyms, which has been an issue in distributional hypothesis-based approaches. 
In summary, social image tags can be utilized as yet another source of visually enforced features, provided the amount of available tags is large enough.",{"paper_id":3807,"title":3808,"year":81,"month":855,"day":63,"doi":3809,"resource_url":3810,"first_page":63,"last_page":63,"pdf_url":3811,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3812,"paper_type":860,"authors":3813,"abstract":3820},"lrec2018-main-157","Towards AMR-BR: A SemBank for Brazilian Portuguese Language","10.63317\u002F2vrivqxeyc4c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-157","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F366.pdf","anchieta-pardo-2018-towards",[3814,3817],{"paper_id":3807,"author_seq":247,"given_name":3815,"surname":3816,"affiliation":63,"orcid":63},"Rafael","Anchiêta",{"paper_id":3807,"author_seq":232,"given_name":3818,"surname":3819,"affiliation":63,"orcid":63},"Thiago","Pardo","We present in this paper an effort to build an AMR (Abstract Meaning Representation) annotated corpus (a semantic bank) for Brazilian Portuguese. AMR is a recent and prominent meaning representation with good acceptance and several applications in the Natural Language Processing area. 
Following what has been done for other languages, and using an alignment-based approach for annotation, we annotated the Little Prince book, which went into the public domain and explored some language-specific annotation issues.",{"paper_id":3822,"title":3823,"year":81,"month":855,"day":63,"doi":3824,"resource_url":3825,"first_page":63,"last_page":63,"pdf_url":3826,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3827,"paper_type":860,"authors":3828,"abstract":3839},"lrec2018-main-158","Towards a Welsh Semantic Annotation System","10.63317\u002F5ca3jue99dt7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-158","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F458.pdf","piao-etal-2018-towards",[3829,3832,3834,3837],{"paper_id":3822,"author_seq":247,"given_name":3830,"surname":3831,"affiliation":63,"orcid":63},"Scott","Piao",{"paper_id":3822,"author_seq":232,"given_name":3690,"surname":3833,"affiliation":63,"orcid":63},"Rayson",{"paper_id":3822,"author_seq":218,"given_name":3835,"surname":3836,"affiliation":63,"orcid":63},"Dawn","Knight",{"paper_id":3822,"author_seq":203,"given_name":1726,"surname":3838,"affiliation":63,"orcid":63},"Watkins","Automatic semantic annotation of natural language data is an important task in Natural Language Processing, and a variety of semantic taggers have been developed for this task, particularly for English. However, for many languages, particularly for low-resource languages, such tools are yet to be developed. In this paper, we report on the development of an automatic Welsh semantic annotation tool (named CySemTagger) in the CorCenCC Project, which will facilitate semantic-level analysis of Welsh language data on a large scale. 
Based on Lancaster’s USAS semantic tagger framework, this tool tags words in Welsh texts with semantic tags from a semantic classification scheme, and is designed to be compatible with multiple Welsh POS taggers and POS tagsets by mapping different tagsets into a core shared POS tagset that is used internally by CySemTagger. Our initial evaluation shows that the tagger can cover up to 91.78% of words in Welsh text. This tagger is under continuous development, and will provide a critical tool for Welsh language corpus and information processing at semantic level.",{"paper_id":3841,"title":3842,"year":81,"month":855,"day":63,"doi":3843,"resource_url":3844,"first_page":63,"last_page":63,"pdf_url":3845,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3846,"paper_type":860,"authors":3847,"abstract":3857},"lrec2018-main-159","Semantic Frame Parsing for Information Extraction : the CALOR corpus","10.63317\u002F5hpmtq47z62u","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-159","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F527.pdf","marzinotto-etal-2018-semantic",[3848,3850,3851,3854,3856],{"paper_id":3841,"author_seq":247,"given_name":1281,"surname":3849,"affiliation":63,"orcid":63},"Marzinotto",{"paper_id":3841,"author_seq":232,"given_name":1124,"surname":1125,"affiliation":63,"orcid":63},{"paper_id":3841,"author_seq":218,"given_name":3852,"surname":3853,"affiliation":63,"orcid":63},"Frederic","Bechet",{"paper_id":3841,"author_seq":203,"given_name":3855,"surname":1122,"affiliation":63,"orcid":63},"Geraldine",{"paper_id":3841,"author_seq":188,"given_name":1127,"surname":1128,"affiliation":63,"orcid":63},"This paper presents a publicly available corpus of French encyclopedic history texts annotated according to the Berkeley FrameNet formalism. 
The main difference in our approach compared to previous works on semantic parsing with FrameNet is that we are not interested here in full text parsing but rather on partial parsing. The goal is to select from the FrameNet resources the minimal set of frames that are going to be useful for the applicative framework targeted, in our case Information Extraction from encyclopedic documents. Such an approach leverage the manual annotation of larger corpus than those obtained through full text parsing and therefore open the door to alternative methods for Frame parsing than those used so far on the FrameNet 1.5 benchmark corpus. The approaches compared in this study rely on an integrated sequence labeling model which jointly optimizes frame identification and semantic role segmentation and identification. The models compared are CRFs and multitasks bi-LSTMs.",{"paper_id":3859,"title":3860,"year":81,"month":855,"day":63,"doi":3861,"resource_url":3862,"first_page":63,"last_page":63,"pdf_url":3863,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3864,"paper_type":860,"authors":3865,"abstract":3874},"lrec2018-main-160","Using a Corpus of English and Chinese Political Speeches for Metaphor Analysis","10.63317\u002F2a8cuutk5yc9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-160","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F571.pdf","ahrens-etal-2018-using",[3866,3869,3872],{"paper_id":3859,"author_seq":247,"given_name":3867,"surname":3868,"affiliation":63,"orcid":63},"Kathleen","Ahrens",{"paper_id":3859,"author_seq":232,"given_name":3870,"surname":3871,"affiliation":63,"orcid":63},"Huiheng","Zeng",{"paper_id":3859,"author_seq":218,"given_name":3873,"surname":2413,"affiliation":63,"orcid":63},"Shun-han Rebekah","In this article, we present details of our corpus of political speeches and introduce using the corpus for metaphor analysis in political discourse. 
Although specialized corpora on a variety of topics are now easily available, online political corpora available for public use are scarce. The database our research team has developed contains more than six million English and Chinese political speeches and is currently available free online. Researchers in many fields are able to use the multiple search functions on the website for their specific research purposes. In particular, the corpus is useful for researchers focusing on political speeches and conceptual metaphor analyses. From the perspective of metaphor study, we have taken advantage of several functions to facilitate the corpus-based metaphor analyses. In short, this database enriches the current bilingual resources and contributes to the evaluation of political language by linguists and political scientists.",{"paper_id":3876,"title":3877,"year":81,"month":855,"day":63,"doi":3878,"resource_url":3879,"first_page":63,"last_page":63,"pdf_url":3880,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3881,"paper_type":860,"authors":3882,"abstract":3896},"lrec2018-main-161","A Multi- versus a Single-classifier Approach for the Identification of Modality in the Portuguese 
Language","10.63317\u002F3rx2ukbdbzs7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-161","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F616.pdf","sequeira-etal-2018-multi",[3883,3886,3889,3892,3895],{"paper_id":3876,"author_seq":247,"given_name":3884,"surname":3885,"affiliation":63,"orcid":63},"João","Sequeira",{"paper_id":3876,"author_seq":232,"given_name":3887,"surname":3888,"affiliation":63,"orcid":63},"Teresa","Gonçalves",{"paper_id":3876,"author_seq":218,"given_name":3890,"surname":3891,"affiliation":63,"orcid":63},"Paulo","Quaresma",{"paper_id":3876,"author_seq":203,"given_name":3893,"surname":3894,"affiliation":63,"orcid":63},"Amália","Mendes",{"paper_id":3876,"author_seq":188,"given_name":2283,"surname":2284,"affiliation":63,"orcid":63},"This work presents a comparative study between two different approaches to build an automatic classification system for Modality values in the Portuguese language. One approach uses a single multi-class classifier with the full dataset that includes eleven modal verbs; the other builds different classifiers, one for each verb. The performance is measured using precision, recall and F 1 . Due to the unbalanced nature of the dataset a weighted average approach was calculated for each metric. We use support vector machines as our classifier and experimented with various SVM kernels to find the optimal classifier for the task at hand. We experimented with several different types of feature attributes representing parse tree information and compare these complex feature representation against a simple bag-of-words feature representation as baseline. 
The best obtained F 1 values are above 0.60 and from the results it is possible to conclude that there is no significant difference between both approaches.",{"paper_id":3898,"title":3899,"year":81,"month":855,"day":63,"doi":3900,"resource_url":3901,"first_page":63,"last_page":63,"pdf_url":3902,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3903,"paper_type":860,"authors":3904,"abstract":3917},"lrec2018-main-162","All-words Word Sense Disambiguation Using Concept Embeddings","10.63317\u002F4dp2tembwj47","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-162","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F100.pdf","suzuki-etal-2018-words",[3905,3908,3911,3914,3915],{"paper_id":3898,"author_seq":247,"given_name":3906,"surname":3907,"affiliation":63,"orcid":63},"Rui","Suzuki",{"paper_id":3898,"author_seq":232,"given_name":3909,"surname":3910,"affiliation":63,"orcid":63},"Kanako","Komiya",{"paper_id":3898,"author_seq":218,"given_name":3912,"surname":3913,"affiliation":63,"orcid":63},"Masayuki","Asahara",{"paper_id":3898,"author_seq":203,"given_name":2466,"surname":2636,"affiliation":63,"orcid":63},{"paper_id":3898,"author_seq":188,"given_name":3651,"surname":3916,"affiliation":63,"orcid":63},"Shinnou","All-words word sense disambiguation (all-words WSD) is the task of identifying the senses of all words in a document. Since the sense of a word depends on the context, such as the surrounding words, similar words are believed to have similar sets of surrounding words. We therefore predict the target word senses by calculating the distances between the surrounding word vectors of the target words and their synonyms using word embeddings. In addition, we introduce the new idea of concept embeddings, constructed from concept tag sequences created from the results of previous prediction steps. 
We predict the target word senses using the distances between surrounding word vectors constructed from word and concept embeddings, via a bootstrapped iterative process. Experimental results show that these concept embeddings were able to improve the performance of Japanese all-words WSD.",{"paper_id":3919,"title":3920,"year":81,"month":855,"day":63,"doi":3921,"resource_url":3922,"first_page":63,"last_page":63,"pdf_url":3923,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3924,"paper_type":860,"authors":3925,"abstract":3933},"lrec2018-main-163","Enhancing Modern Supervised Word Sense Disambiguation Models by Semantic Lexical Resources","10.63317\u002F44qnvjw7fk4e","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-163","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F112.pdf","melacci-etal-2018-enhancing",[3926,3928,3931],{"paper_id":3919,"author_seq":247,"given_name":2733,"surname":3927,"affiliation":63,"orcid":63},"Melacci",{"paper_id":3919,"author_seq":232,"given_name":3929,"surname":3930,"affiliation":63,"orcid":63},"Achille","Globo",{"paper_id":3919,"author_seq":218,"given_name":1956,"surname":3932,"affiliation":63,"orcid":63},"Rigutini","Supervised models for Word Sense Disambiguation (WSD) currently yield to state-of-the-art results in the most popular benchmarks. Despite the recent introduction of Word Embeddings and Recurrent Neural Networks to design powerful context-related features, the interest in improving WSD models using Semantic Lexical Resources (SLRs) is mostly restricted to knowledge-based approaches. In this paper, we enhance \"modern\" supervised WSD models exploiting two popular SLRs: WordNet and WordNet Domains. We propose an effective way to introduce semantic features into the classifiers, and we consider using the SLR structure to augment the training data. 
We study the effect of different types of semantic features, investigating their interaction with local contexts encoded by means of mixtures of Word Embeddings or Recurrent Neural Networks, and we extend the proposed model into a novel multi-layer architecture for WSD. A detailed experimental comparison in the recent Unified Evaluation Framework (Raganato et al., 2017) shows that the proposed approach leads to supervised models that compare favourably with the state-of-the-art.",{"paper_id":3935,"title":3936,"year":81,"month":855,"day":63,"doi":3937,"resource_url":3938,"first_page":63,"last_page":63,"pdf_url":3939,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3940,"paper_type":860,"authors":3941,"abstract":3953},"lrec2018-main-164","An Unsupervised Word Sense Disambiguation System for Under-Resourced Languages","10.63317\u002F3eof7uuaauwd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-164","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F182.pdf","ustalov-etal-2018-unsupervised",[3942,3945,3948,3949,3951,3952],{"paper_id":3935,"author_seq":247,"given_name":3943,"surname":3944,"affiliation":63,"orcid":63},"Dmitry","Ustalov",{"paper_id":3935,"author_seq":232,"given_name":3946,"surname":3947,"affiliation":63,"orcid":63},"Denis","Teslenko",{"paper_id":3935,"author_seq":218,"given_name":2736,"surname":2737,"affiliation":63,"orcid":63},{"paper_id":3935,"author_seq":203,"given_name":2876,"surname":3950,"affiliation":63,"orcid":63},"Chernoskutov",{"paper_id":3935,"author_seq":188,"given_name":1367,"surname":2739,"affiliation":63,"orcid":63},{"paper_id":3935,"author_seq":172,"given_name":2741,"surname":2742,"affiliation":63,"orcid":63},"In this paper, we present Watasense, an unsupervised system for word sense disambiguation. 
Given a sentence, the system chooses the most relevant sense of each input word with respect to the semantic similarity between the given sentence and the synset constituting the sense of the target word. Watasense has two modes of operation. The sparse mode uses the traditional vector space model to estimate the most similar word sense corresponding to its context. The dense mode, instead, uses synset embeddings to cope with the sparsity problem. We describe the architecture of the present system and also conduct its evaluation on three different lexical semantic resources for Russian. We found that the dense mode substantially outperforms the sparse one on all datasets according to the adjusted Rand index.",{"paper_id":3955,"title":3956,"year":81,"month":855,"day":63,"doi":3957,"resource_url":3958,"first_page":63,"last_page":63,"pdf_url":3959,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3960,"paper_type":860,"authors":3961,"abstract":3970},"lrec2018-main-165","Unsupervised Korean Word Sense Disambiguation using CoreNet","10.63317\u002F3evzrnrvbz56","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-165","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F224.pdf","han-etal-2018-unsupervised",[3962,3964,3967,3968,3969],{"paper_id":3955,"author_seq":247,"given_name":3963,"surname":2403,"affiliation":63,"orcid":63},"Kijong",{"paper_id":3955,"author_seq":232,"given_name":3965,"surname":3966,"affiliation":63,"orcid":63},"Sangha","Nam",{"paper_id":3955,"author_seq":218,"given_name":1103,"surname":1104,"affiliation":63,"orcid":63},{"paper_id":3955,"author_seq":203,"given_name":1100,"surname":1101,"affiliation":63,"orcid":63},{"paper_id":3955,"author_seq":188,"given_name":1109,"surname":1110,"affiliation":63,"orcid":63},"In this study, we investigated unsupervised learning based Korean word sense disambiguation (WSD) using CoreNet, a Korean lexical semantic network. 
To facilitate the application of WSD to practical natural language processing problems, a reasonable method is required to distinguish between sense candidates. We therefore performed coarse-grained Korean WSD studies while utilizing the hierarchical semantic categories of CoreNet to distinguish between sense candidates. In our unsupervised approach, we applied a knowledge-based model that incorporated a Markov random field and dependency parsing to the Korean language in addition to utilizing the semantic categories of CoreNet. Our experimental results demonstrate that the developed CoreNet based coarse-grained WSD technique exhibited an 80.9% accuracy on the datasets we constructed, and was proven to be effective for practical applications.",{"paper_id":3972,"title":3973,"year":81,"month":855,"day":63,"doi":3974,"resource_url":3975,"first_page":63,"last_page":63,"pdf_url":3976,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3977,"paper_type":860,"authors":3978,"abstract":3986},"lrec2018-main-166","UFSAC: Unification of Sense Annotated Corpora and Tools","10.63317\u002F3osp8gskqh3j","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-166","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F250.pdf","vial-etal-2018-ufsac",[3979,3981,3983],{"paper_id":3972,"author_seq":247,"given_name":2108,"surname":3980,"affiliation":63,"orcid":63},"Vial",{"paper_id":3972,"author_seq":232,"given_name":2202,"surname":3982,"affiliation":63,"orcid":63},"Lecouteux",{"paper_id":3972,"author_seq":218,"given_name":3984,"surname":3985,"affiliation":63,"orcid":63},"Didier","Schwab","In Word Sense Disambiguation, sense annotated corpora are often essential for evaluating a system and also valuable in order to reach a good efficiency. Always created for a specific purpose, there are today a dozen of sense annotated English corpora, in various formats and using different versions of WordNet. 
The main hypothesis of this work is that it should be possible to build a disambiguation system by using any of these corpora during the training phase or during the testing phase regardless of their original purpose. In this article, we present UFSAC: a format of corpus that can be used for either training or testing a disambiguation system, and the process we followed for constructing this format. We give to the community the whole set of sense annotated English corpora that we know, in this unified format, when the copyright allows it, with sense keys converted to the last version of WordNet. We also provide the source code for building these corpora from their original data, and a complete Java API for manipulating corpora in this format. The whole resource is available at the following URL: https:\u002F\u002Fgithub.com\u002Fgetalp\u002FUFSAC.",{"paper_id":3988,"title":3989,"year":81,"month":855,"day":63,"doi":3990,"resource_url":3991,"first_page":63,"last_page":63,"pdf_url":3992,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":3993,"paper_type":860,"authors":3994,"abstract":3999},"lrec2018-main-167","Retrofitting Word Representations for Unsupervised Sense Aware Word Similarities","10.63317\u002F2b3pzp2jiifk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-167","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F290.pdf","remus-biemann-2018-retrofitting",[3995,3998],{"paper_id":3988,"author_seq":247,"given_name":3996,"surname":3997,"affiliation":63,"orcid":63},"Steffen","Remus",{"paper_id":3988,"author_seq":232,"given_name":1367,"surname":2739,"affiliation":63,"orcid":63},"Standard word embeddings lack the possibility to distinguish senses of a word by projecting them to exactly one vector. This has a negative effect particularly when computing similarity scores between words using standard vector-based similarity measures such as cosine similarity. 
We argue that minor senses play an important role in word similarity computations, hence we use an unsupervised sense inventory resource to retrofit monolingual word embeddings, producing sense-aware embeddings. Using retrofitted sense-aware embeddings, we show improved word similarity and relatedness results on multiple word embeddings and multiple established word similarity tasks, sometimes up to an impressive margin of 0.15 Spearman correlation score.",{"paper_id":4001,"title":4002,"year":81,"month":855,"day":63,"doi":4003,"resource_url":4004,"first_page":63,"last_page":63,"pdf_url":4005,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4006,"paper_type":860,"authors":4007,"abstract":4018},"lrec2018-main-168","FastSense: An Efficient Word Sense Disambiguation Classifier","10.63317\u002F5bpb2b9hymv8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-168","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F736.pdf","uslu-etal-2018-fastsense",[4008,4011,4013,4015],{"paper_id":4001,"author_seq":247,"given_name":4009,"surname":4010,"affiliation":63,"orcid":63},"Tolga","Uslu",{"paper_id":4001,"author_seq":232,"given_name":2736,"surname":4012,"affiliation":63,"orcid":63},"Mehler",{"paper_id":4001,"author_seq":218,"given_name":2554,"surname":4014,"affiliation":63,"orcid":63},"Baumartz",{"paper_id":4001,"author_seq":203,"given_name":4016,"surname":4017,"affiliation":63,"orcid":63},"Wahed","Hemati","The task of Word Sense Disambiguation (WSD) is to determine the meaning of an ambiguous word in a given context. In spite of its importance for most NLP pipelines, WSD can still be seen to be unsolved. The reason is that we currently lack tools for WSD that handle big data – “big” in terms of the number of ambiguous words and in terms of the overall number of senses to be distinguished. 
This desideratum is exactly the objective of fastSense, an efficient neural network-based tool for word sense disambiguation introduced in this paper. We train and test fastSense by means of the disambiguation pages of the German Wikipedia. In addition, we evaluate fastSense in the context of Senseval and SemEval. By reference to Senseval and SemEval we additionally perform a parameter study. We show that fastSense can process huge amounts of data quickly and also surpasses state-of-the-art tools in terms of F-measure.",{"paper_id":4020,"title":4021,"year":81,"month":855,"day":63,"doi":4022,"resource_url":4023,"first_page":63,"last_page":63,"pdf_url":4024,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4025,"paper_type":860,"authors":4026,"abstract":4040},"lrec2018-main-169","Text Annotation Graphs: Annotating Complex Natural Language Phenomena","10.63317\u002F4v56fmovi3iq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-169","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F218.pdf","forbes-etal-2018-text",[4027,4030,4033,4036,4039],{"paper_id":4020,"author_seq":247,"given_name":4028,"surname":4029,"affiliation":63,"orcid":63},"Angus","Forbes",{"paper_id":4020,"author_seq":232,"given_name":4031,"surname":4032,"affiliation":63,"orcid":63},"Kristine","Lee",{"paper_id":4020,"author_seq":218,"given_name":4034,"surname":4035,"affiliation":63,"orcid":63},"Gus","Hahn-Powell",{"paper_id":4020,"author_seq":203,"given_name":4037,"surname":4038,"affiliation":63,"orcid":63},"Marco A.","Valenzuela-Escárcega",{"paper_id":4020,"author_seq":188,"given_name":2819,"surname":2820,"affiliation":63,"orcid":63},"This paper introduces a new web-based software tool for annotating text, Text Annotation Graphs, or TAG. 
It provides functionality for representing complex relationships between words and word phrases that are not available in other software tools, including the ability to define and visualize relationships between the relationships themselves (semantic hypergraphs). Additionally, we include an approach to representing text annotations in which annotation subgraphs, or semantic summaries, are used to show relationships outside of the sequential context of the text itself. Users can use these subgraphs to quickly find similar structures within the current document or external annotated documents. Initially, TAG was developed to support information extraction tasks on a large database of biomedical articles. However, our software is flexible enough to support a wide range of annotation tasks for many domains. Examples are provided that showcase TAG's capabilities on morphological parsing and event extraction tasks.",{"paper_id":4042,"title":4043,"year":81,"month":855,"day":63,"doi":4044,"resource_url":4045,"first_page":63,"last_page":63,"pdf_url":4046,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4047,"paper_type":860,"authors":4048,"abstract":4055},"lrec2018-main-170","Manzanilla: An Image Annotation Tool for TKB Building","10.63317\u002F57oymfvf9fa6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-170","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F248.pdf","reimerink-leon-arauz-2018-manzanilla",[4049,4052],{"paper_id":4042,"author_seq":247,"given_name":4050,"surname":4051,"affiliation":63,"orcid":63},"Arianne","Reimerink",{"paper_id":4042,"author_seq":232,"given_name":4053,"surname":4054,"affiliation":63,"orcid":63},"Pilar","León-Araúz","Much has been written regarding the importance of combining visual and textual information to enhance knowledge acquisition (Paivio, 1971, 1986; Mayer & Anderson, 1992). 
However, the combination of images and text still needs further analysis (Faber, 2012; Prieto, 2008; Prieto & Faber, 2012). An in-depth analysis of the features of images provides the means to develop selection criteria for specific representation purposes. The combination of conceptual content, image type based on morphological characteristics, and functional criteria can be used to enhance the selection and annotation of images that explicitly focus on the conceptual propositions that best define concepts in a knowledge base. Manzanilla is an image annotation tool specifically created for EcoLexicon, a multilingual and multimodal terminological knowledge base (TKB) on the environment. It is powered by Camomile (Poignant et al., 2016) according to the selection and annotation criteria resulting from ten years of research on multimodality within the framework of Frame-Based Terminology (FBT; Faber, León-Araúz & Reimerink, 2014). The tool was created to enhance the consistency of knowledge representation through images with the conceptual knowledge in EcoLexicon and to improve image reusability.",{"paper_id":4057,"title":4058,"year":81,"month":855,"day":63,"doi":4059,"resource_url":4060,"first_page":63,"last_page":63,"pdf_url":4061,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4062,"paper_type":860,"authors":4063,"abstract":4068},"lrec2018-main-171","Tools for The Production of Analogical Grids and a Resource of N-gram Analogical Grids in 11 Languages","10.63317\u002F2oa2ezd7h3zb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-171","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F344.pdf","fam-lepage-2018-tools",[4064,4067],{"paper_id":4057,"author_seq":247,"given_name":4065,"surname":4066,"affiliation":63,"orcid":63},"Rashel","Fam",{"paper_id":4057,"author_seq":232,"given_name":2187,"surname":2188,"affiliation":63,"orcid":63},"We release a Python module containing several tools to build 
analogical grids from words contained in a corpus. The module implements several previously presented algorithms. The tools are language-independent. This permits their use with any language and any writing system. We hope that the tools will ease research in morphology by allowing researchers to automatically obtain structured representations of the vocabulary contained in corpora or linguistic data. We also release analogical grids built on the vocabularies contained in 1,000 corresponding lines of the 11 different language versions of the Europarl corpus v.3. The grids were built on N-grams of different lengths, from words to 6-grams. We hope that the use of structured parallel data will foster research in comparative linguistics.",{"paper_id":4070,"title":4071,"year":81,"month":855,"day":63,"doi":4072,"resource_url":4073,"first_page":63,"last_page":63,"pdf_url":4074,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4075,"paper_type":860,"authors":4076,"abstract":4080},"lrec2018-main-172","The Automatic Annotation of the Semiotic Type of Hand Gestures in Obama’s Humorous Speeches","10.63317\u002F529up4ibamzw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-172","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F412.pdf","navarretta-2018-automatic",[4077],{"paper_id":4070,"author_seq":247,"given_name":4078,"surname":4079,"affiliation":63,"orcid":63},"Costanza","Navarretta","This paper describes a pilot study act to investigate the semiotic types of hand gestures in video-recorded speeches and their automatic classification. Gestures, which also comprise e.g. head movements and body posture, contribute to the successful delivery of the message by reinforcing what is expressed by speech or by adding new information to what is uttered. 
The automatic classification of the semiotic type of gestures from their shape description can contribute to their interpretation in human-human communication and in advanced multimodal interactive systems. We annotated and analysed hand gestures produced by Barack Obama during two speeches at the Annual White House Correspondent Dinners and found differences in the contexts in which various hand gesture types were used. Then, we trained machine learning algorithms to classify the semiotic type of the hand gestures. The F-score obtained by the best performing algorithm on the classification of four semiotic types is 0.59. Surprisingly, the shape feature that contributes mostly to classification is the trajectory of the left hand. The results of this study are promising, but they should be tested on more data of different type, produced by different speakers and in more languages.",{"paper_id":4082,"title":4083,"year":81,"month":855,"day":63,"doi":4084,"resource_url":4085,"first_page":63,"last_page":63,"pdf_url":4086,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4087,"paper_type":860,"authors":4088,"abstract":4095},"lrec2018-main-173","WASA: A Web Application for Sequence Annotation","10.63317\u002F5ji4pxvwrw3d","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-173","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F474.pdf","alghamdi-diab-2018-wasa",[4089,4092],{"paper_id":4082,"author_seq":247,"given_name":4090,"surname":4091,"affiliation":63,"orcid":63},"Fahad","AlGhamdi",{"paper_id":4082,"author_seq":232,"given_name":4093,"surname":4094,"affiliation":63,"orcid":63},"Mona","Diab","Data annotation is an important and necessary task for all NLP applications. Designing and implementing a web-based application that enables many annotators to annotate and enter their input into one central database is not a trivial task. 
These kinds of web-based applications require a consistent and robust backup for the underlying database and support to enhance the efficiency and speed of the annotation. Also, they need to ensure that the annotations are stored with a minimal amount of redundancy in order to take advantage of the available resources (e.g., storage space). In this paper, we introduce WASA,  a web-based annotation system for managing large-scale multilingual Code Switching (CS) data annotation. Although WASA has the ability to perform the annotation for any token sequence with arbitrary tag sets, we will focus on how WASA is used for CS annotation.  The system supports concurrent annotation, handles multiple encodings, allows for several levels of management control,  and enables quality control measures while seamlessly reporting annotation statistics from various perspectives and at different levels of granularity. Moreover, the system is integrated with a robust language specific data preprocessing tool to enhance the speed and efficiency of the annotation. 
We describe the annotation and the administration interfaces as well as the backend engine.",{"paper_id":4097,"title":4098,"year":81,"month":855,"day":63,"doi":4099,"resource_url":4100,"first_page":63,"last_page":63,"pdf_url":4101,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4102,"paper_type":860,"authors":4103,"abstract":4112},"lrec2018-main-174","Annotation and Quantitative Analysis of Speaker Information in Novel Conversation Sentences in Japanese","10.63317\u002F2ijvnfi6y6w3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-174","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F626.pdf","yamazaki-etal-2018-annotation",[4104,4107,4109],{"paper_id":4097,"author_seq":247,"given_name":4105,"surname":4106,"affiliation":63,"orcid":63},"Makoto","Yamazaki",{"paper_id":4097,"author_seq":232,"given_name":4108,"surname":89,"affiliation":63,"orcid":63},"Yumi",{"paper_id":4097,"author_seq":218,"given_name":4110,"surname":4111,"affiliation":63,"orcid":63},"Wakako","Kashino","This study undertook a quantitative lexicological analysis using attributed speaker information, and reports on the problems of creating standards when annotating speaker information (gender and age) of conversation sentences in novels. In this paper, we performed a comparative analysis of vocabulary use by gender and age of conversation sentences and descriptive part sentences, as well as on the differences between Japanese novels and translations of foreign novels. 
In addition, a comparison with other spoken language materials was made.",{"paper_id":4114,"title":4115,"year":81,"month":855,"day":63,"doi":4116,"resource_url":4117,"first_page":63,"last_page":63,"pdf_url":4118,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4119,"paper_type":860,"authors":4120,"abstract":4127},"lrec2018-main-175","PDFAnno: a Web-based Linguistic Annotation Tool for PDF Documents","10.63317\u002F3hubowvdb43y","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-175","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F680.pdf","shindo-etal-2018-pdfanno",[4121,4123,4126],{"paper_id":4114,"author_seq":247,"given_name":3651,"surname":4122,"affiliation":63,"orcid":63},"Shindo",{"paper_id":4114,"author_seq":232,"given_name":4124,"surname":4125,"affiliation":63,"orcid":63},"Yohei","Munesada",{"paper_id":4114,"author_seq":218,"given_name":3649,"surname":2464,"affiliation":63,"orcid":63},"We present PDFAnno, a web-based linguistic annotation tool for PDF documents.  PDF has become widespread standard for various types of publications,  however, current tools for linguistic annotation mostly focus on  plain-text documents. PDFAnno offers functions for various types of  linguistic annotations directly on PDF, including named entity, dependency relation, and coreference chain.  Furthermore, for multi-user support, it allows simultaneous visualization  of multi-user's annotations on the single PDF, which is useful for checking  inter-annotator agreement and resolving annotation conflicts.  
PDFAnno is freely available under open-source license at  https:\u002F\u002Fgithub.com\u002Fpaperai\u002Fpdfanno.",{"paper_id":4129,"title":4130,"year":81,"month":855,"day":63,"doi":4131,"resource_url":4132,"first_page":63,"last_page":63,"pdf_url":4133,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4134,"paper_type":860,"authors":4135,"abstract":4139},"lrec2018-main-176","A Lightweight Modeling Middleware for Corpus Processing","10.63317\u002F3su7jn3ku4c8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-176","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F691.pdf","gartner-kuhn-2018-lightweight",[4136,4137],{"paper_id":4129,"author_seq":247,"given_name":1771,"surname":2668,"affiliation":63,"orcid":63},{"paper_id":4129,"author_seq":232,"given_name":1278,"surname":4138,"affiliation":63,"orcid":63},"Kuhn","Present-day empirical research in computational or theoretical linguistics has at its disposal an enormous wealth in the form of richly annotated and diverse corpus resources. Especially the points of contact between modalities are areas of exciting new research. However, progress in those areas in particular suffers from poor coverage in terms of visualization or query systems. Many limitations for such tools stem from the non-uniform representations of very diverse resources and the lack of standards that address this problem from the perspective of processing or querying. In this paper we present our framework for modeling arbitrary multi-modal corpus resources in a unified form for processing tools. It serves as a middleware system and combines the expressiveness of general graph-based models with a rich metadata schema to preserve linguistic specificity. 
By separating data structures and their linguistic interpretations, it assists tools on top of it so that they can in turn allow their users to more efficiently exploit corpus resources.",{"paper_id":4141,"title":4142,"year":81,"month":855,"day":63,"doi":4143,"resource_url":4144,"first_page":63,"last_page":63,"pdf_url":4145,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4146,"paper_type":860,"authors":4147,"abstract":4154},"lrec2018-main-177","An Annotation Language for Semantic Search of Legal Sources","10.63317\u002F3a4vxy7nn7z7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-177","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F728.pdf","nazarenko-etal-2018-annotation",[4148,4150,4152],{"paper_id":4141,"author_seq":247,"given_name":2199,"surname":4149,"affiliation":63,"orcid":63},"Nazarenko",{"paper_id":4141,"author_seq":232,"given_name":3525,"surname":4151,"affiliation":63,"orcid":63},"Levy",{"paper_id":4141,"author_seq":218,"given_name":3370,"surname":4153,"affiliation":63,"orcid":63},"Wyner","While formalizing legal sources is an important challenge, the generation of a formal representation from legal texts has been far less considered and requires considerable expertise. In order to improve the uniformity, richness, and efficiency of legal annotation, it is necessary to experiment with annotations and the annotation process. This paper reports on a first experiment, which was a campaign to annotate legal instruments provided by the Scottish Government's Parliamentary Counsel Office and bearing on Scottish smoking legislation and regulation. A small set of elements related to LegalRuleML was used. An initial guideline manual was produced to annotate the text using annotations related to these elements. The resulting annotated corpus is converted into a LegalRuleML XML compliant document, then made available via an online visualisation and query tool. 
In the course of annotating the documents, a range of important interpretive and practical issues arose, highlighting the value of a focused study on legal text annotation.",{"paper_id":4156,"title":4157,"year":81,"month":855,"day":63,"doi":4158,"resource_url":4159,"first_page":63,"last_page":63,"pdf_url":4160,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4161,"paper_type":860,"authors":4162,"abstract":4176},"lrec2018-main-178","Resource Interoperability for Sustainable Benchmarking: The Case of Events","10.63317\u002F5bpv9czi3q5s","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-178","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F865.pdf","van-son-etal-2018-resource",[4163,4166,4169,4170,4173],{"paper_id":4156,"author_seq":247,"given_name":4164,"surname":4165,"affiliation":63,"orcid":63},"Chantal","van Son",{"paper_id":4156,"author_seq":232,"given_name":4167,"surname":4168,"affiliation":63,"orcid":63},"Oana","Inel",{"paper_id":4156,"author_seq":218,"given_name":1897,"surname":1898,"affiliation":63,"orcid":63},{"paper_id":4156,"author_seq":203,"given_name":4171,"surname":4172,"affiliation":63,"orcid":63},"Lora","Aroyo",{"paper_id":4156,"author_seq":188,"given_name":4174,"surname":4175,"affiliation":63,"orcid":63},"Piek","Vossen","With the continuous growth of benchmark corpora, which often annotate the same documents, there is a range of opportunities to compare and combine similar and complementary annotations. However, these opportunities are hampered by a wide range of problems that are related to the lack of resource interoperability. In this paper, we illustrate these problems by assessing aspects of interoperability at the document-level across a set of 20 corpora annotated with (aspects of) events. The issues range from applying different document naming conventions, to mismatches in textual content and structural\u002Fconceptual differences among annotation schemes. 
We provide insight into the exact document intersections between the corpora by mapping their document identifiers and perform an empirical analysis of event annotations showing their compatibility and consistency in and across the corpora. This way, we aim to make the community more aware of the challenges and opportunities and to inspire working collaboratively towards interoperable resources.",{"paper_id":4178,"title":4179,"year":81,"month":855,"day":63,"doi":4180,"resource_url":4181,"first_page":63,"last_page":63,"pdf_url":4182,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4183,"paper_type":860,"authors":4184,"abstract":4197},"lrec2018-main-179","Parsivar: A Language Processing Toolkit for Persian","10.63317\u002F5cmzfmbj8mef","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-179","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F908.pdf","mohtaj-etal-2018-parsivar",[4185,4188,4191,4194],{"paper_id":4178,"author_seq":247,"given_name":4186,"surname":4187,"affiliation":63,"orcid":63},"Salar","Mohtaj",{"paper_id":4178,"author_seq":232,"given_name":4189,"surname":4190,"affiliation":63,"orcid":63},"Behnam","Roshanfekr",{"paper_id":4178,"author_seq":218,"given_name":4192,"surname":4193,"affiliation":63,"orcid":63},"Atefeh","Zafarian",{"paper_id":4178,"author_seq":203,"given_name":4195,"surname":4196,"affiliation":63,"orcid":63},"Habibollah","Asghari","With the growth of Internet usage, a massive amount of textual data is generated on social media and the Web. As the text on the Web are generated by different authors with various types of writing styles and different encodings, a preprocessing step is required before applying any NLP task. The goal of preprocessing is to convert text into a standard format that makes it easy to extract information from documents and sentences. 
Moreover, the problem is more acute when we deal with Arabic script-based languages, in which there are some different kinds of encoding schemes, different kinds of writing styles and the spaces between or within the words. This paper introduces a preprocessing toolkit named as Parsivar, which is a comprehensive set of tools for Persian text preprocessing tasks. This toolkit performs various kinds of activities comprised of normalization, space correction, tokenization, stemming, parts of speech tagging and shallow parsing. To evaluate the performance of the proposed toolkit, both intrinsic and extrinsic approaches for evaluation have been applied. A Persian plagiarism detection system has been exploited as a downstream task for extrinsic evaluation of the proposed toolkit. The results have revealed that our toolkit outperforms the available Persian preprocessing toolkits by about 8 percent in terms of F1.",{"paper_id":4199,"title":4200,"year":81,"month":855,"day":63,"doi":4201,"resource_url":4202,"first_page":63,"last_page":63,"pdf_url":4203,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4204,"paper_type":860,"authors":4205,"abstract":4212},"lrec2018-main-180","Multilingual Word Segmentation: Training Many Language-Specific Tokenizers Smoothly Thanks to the Universal Dependencies Corpus","10.63317\u002F4fxwp8kcdm8b","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-180","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1072.pdf","moreau-vogel-2018-multilingual",[4206,4209],{"paper_id":4199,"author_seq":247,"given_name":4207,"surname":4208,"affiliation":63,"orcid":63},"Erwan","Moreau",{"paper_id":4199,"author_seq":232,"given_name":4210,"surname":4211,"affiliation":63,"orcid":63},"Carl","Vogel","This paper describes how a tokenizer can be trained from any dataset in the Universal Dependencies 2.1 corpus. A software tool, which relies on Elephant to perform the training, is also made available. 
Beyond providing the community with a large choice of language-specific tokenizers, we argue in this paper that: (1) tokenization should be considered as a supervised task; (2) language scalability requires a streamlined software engineering process across languages.",{"paper_id":4214,"title":4215,"year":81,"month":855,"day":63,"doi":4216,"resource_url":4217,"first_page":63,"last_page":63,"pdf_url":4218,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4219,"paper_type":860,"authors":4220,"abstract":4222},"lrec2018-main-181","Build Fast and Accurate Lemmatization for Arabic","10.63317\u002F2a5pftey6kdr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-181","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1079.pdf","mubarak-2018-build",[4221],{"paper_id":4214,"author_seq":247,"given_name":1151,"surname":1152,"affiliation":63,"orcid":63},"In this paper we describe the complexity of building a lemmatizer for Arabic which has a rich and complex morphology, and show some differences between lemmatization and surface stemming, i.e. removing prefixes and suffixes from words. We discuss the need for a fast and accurate lemmatization to enhance Arabic Information Retrieval results. We also introduce a new dataset that can be used to test lemmatization accuracy, and an efficient lemmatization algorithm that outperforms state-of-the-art Arabic lemmatization in terms of accuracy and speed. 
We share the dataset and the code for research purposes.",{"paper_id":4224,"title":4225,"year":81,"month":855,"day":63,"doi":4226,"resource_url":4227,"first_page":63,"last_page":63,"pdf_url":4228,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4229,"paper_type":860,"authors":4230,"abstract":4242},"lrec2018-main-182","JESC: Japanese-English Subtitle Corpus","10.63317\u002F3jk23ju3goh6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-182","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F30.pdf","pryzant-etal-2018-jesc",[4231,4234,4237,4239],{"paper_id":4224,"author_seq":247,"given_name":4232,"surname":4233,"affiliation":63,"orcid":63},"Reid","Pryzant",{"paper_id":4224,"author_seq":232,"given_name":4235,"surname":4236,"affiliation":63,"orcid":63},"Youngjoo","Chung",{"paper_id":4224,"author_seq":218,"given_name":2383,"surname":4238,"affiliation":63,"orcid":63},"Jurafsky",{"paper_id":4224,"author_seq":203,"given_name":4240,"surname":4241,"affiliation":63,"orcid":63},"Denny","Britz","In this paper we describe the Japanese-English Subtitle Corpus (JESC). JESC is a large Japanese-English parallel corpus covering the underrepresented domain of conversational dialogue. It consists of more than 3.2 million examples, making it the largest freely available dataset of its kind. The corpus was assembled by crawling and aligning subtitles found on the web. The assembly process incorporates a number of novel preprocessing elements to ensure high monolingual fluency and accurate bilingual alignments. 
We summarize its contents and evaluate its quality using human experts and baseline machine translation (MT) systems.",{"paper_id":4244,"title":4245,"year":81,"month":855,"day":63,"doi":4246,"resource_url":4247,"first_page":63,"last_page":63,"pdf_url":4248,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4249,"paper_type":860,"authors":4250,"abstract":4268},"lrec2018-main-183","Building a Corpus for Personality-dependent Natural Language Understanding and Generation","10.63317\u002F3hj6ffmvw7uk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-183","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F31.pdf","ramos-etal-2018-building",[4251,4254,4257,4260,4263,4266],{"paper_id":4244,"author_seq":247,"given_name":4252,"surname":4253,"affiliation":63,"orcid":63},"Ricelli","Ramos",{"paper_id":4244,"author_seq":232,"given_name":4255,"surname":4256,"affiliation":63,"orcid":63},"Georges","Neto",{"paper_id":4244,"author_seq":218,"given_name":4258,"surname":4259,"affiliation":63,"orcid":63},"Barbara","Silva",{"paper_id":4244,"author_seq":203,"given_name":4261,"surname":4262,"affiliation":63,"orcid":63},"Danielle","Monteiro",{"paper_id":4244,"author_seq":188,"given_name":4264,"surname":4265,"affiliation":63,"orcid":63},"Ivandré","Paraboni",{"paper_id":4244,"author_seq":172,"given_name":3815,"surname":4267,"affiliation":63,"orcid":63},"Dias","The computational treatment of human personality - both for the recognition of personality traits from text and for the generation of text so as to reflect a particular set of traits - is central to the development of NLP applications. As a means to provide a basic resource for studies of this kind, this article describes the b5 corpus, a collection of controlled and free (non-topic specific) texts produced in different (e.g., referential or descriptive)  communicative tasks, and accompanied by inventories of personality of their authors and additional demographics. 
The present discussion is mainly focused on the various corpus components and on the data collection task itself, but preliminary results of personality recognition from text are presented in order to illustrate how the corpus data may be reused. The  b5 corpus aims to provide support for a wide range of NLP studies based on personality information and it is, to the best of our knowledge, the largest resource of this kind to be made available for research purposes in the Brazilian Portuguese language.",{"paper_id":4270,"title":4271,"year":81,"month":855,"day":63,"doi":4272,"resource_url":4273,"first_page":63,"last_page":63,"pdf_url":4274,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4275,"paper_type":860,"authors":4276,"abstract":4286},"lrec2018-main-184","Linguistic and Sociolinguistic Annotation of 17th Century Dutch Letters","10.63317\u002F5bwfn2bddkra","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-184","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F214.pdf","schraagen-etal-2018-linguistic",[4277,4280,4283],{"paper_id":4270,"author_seq":247,"given_name":4278,"surname":4279,"affiliation":63,"orcid":63},"Marijn","Schraagen",{"paper_id":4270,"author_seq":232,"given_name":4281,"surname":4282,"affiliation":63,"orcid":63},"Feike","Dietz",{"paper_id":4270,"author_seq":218,"given_name":4284,"surname":4285,"affiliation":63,"orcid":63},"Marjo","van Koppen","Developments in the Dutch language during the 17th century, part of the Early Modern period, form an active research topic in historical linguistics and literature. To enable automatic quantitative analysis, a corpus of letters by the 17th century Dutch author and politician P.C. Hooft is manually annotated with parts-of-speech, document segmentation and sociolinguistic metadata. The corpus is developed as part of the Nederlab online research portal, which is available through the CLARIN ERIC European research infrastructure. 
This paper discusses the design and evaluation of the annotation effort, as well as adding new annotations to an existing annotated corpus.",{"paper_id":4288,"title":4289,"year":81,"month":855,"day":63,"doi":4290,"resource_url":4291,"first_page":63,"last_page":63,"pdf_url":4292,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4293,"paper_type":860,"authors":4294,"abstract":4299},"lrec2018-main-185","Simplified Corpus with Core Vocabulary","10.63317\u002F2fm3o2dm56ux","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-185","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F281.pdf","maruyama-yamamoto-2018-simplified",[4295,4298],{"paper_id":4288,"author_seq":247,"given_name":4296,"surname":4297,"affiliation":63,"orcid":63},"Takumi","Maruyama",{"paper_id":4288,"author_seq":232,"given_name":2271,"surname":2272,"affiliation":63,"orcid":63},"We have constructed the simplified corpus for the Japanese language and selected the core vocabulary. The corpus has 50,000 manually simplified and aligned sentences. This corpus contains the original sentences, simplified sentences and English translation of the original sentences. It can be used for automatic text simplification as well as translating simple Japanese into English and vice-versa. The core vocabulary is restricted to 2,000 words where it is selected by accounting for several factors such as meaning preservation, variation, simplicity and the UniDic word segmentation criterion. We repeated the construction of the simplified corpus and, subsequently, updated the core vocabulary accordingly.  As a result, despite vocabulary restrictions, our corpus achieved high quality in grammaticality and meaning preservation. In addition to representing a wide range of expressions, the core vocabulary's limited number helped in showing similarities of expressions among simplified sentences. 
We believe that the same quality can be obtained by extending this corpus.",{"paper_id":4301,"title":4302,"year":81,"month":855,"day":63,"doi":4303,"resource_url":4304,"first_page":63,"last_page":63,"pdf_url":4305,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4306,"paper_type":860,"authors":4307,"abstract":4312},"lrec2018-main-186","A Pragmatic Approach for Classical Chinese Word Segmentation","10.63317\u002F3b4pkezpdv8c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-186","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F295.pdf","huang-wu-2018-pragmatic",[4308,4310],{"paper_id":4301,"author_seq":247,"given_name":4309,"surname":3506,"affiliation":63,"orcid":63},"Shilei",{"paper_id":4301,"author_seq":232,"given_name":4311,"surname":3703,"affiliation":63,"orcid":63},"Jiangqin","Word segmentation, a fundamental technology for lots of downstream applications, plays a significant role in Natural Language Processing, especially for those languages without explicit delimiters, like Chinese, Korean, Japanese and etc. Basically, word segmentation for modern Chinese is worked out to a certain extent. Nevertheless, Classical Chinese is largely neglected, mainly owing to its obsoleteness. One of the biggest problems for the researches of Classical Chinese word segmentation (CCWS) is lacking in standard large-scale shareable marked-up corpora, for the fact that the most excellent approaches, solving word segmentation, are based on machine learning or statistical methods which need quality-assured marked-up corpora. In this paper, we propose a pragmatic approach founded on the difference of t-score (dts) and Baidu Baike (the largest Chinese-language encyclopedia like Wikipedia) in order to deal with CCWS without any marked-up corpus. 
We extract candidate words as well as their corresponding frequency from the Twenty-Five Histories (Twenty-Four Histories and Draft History of Qing) to build a lexicon, and conduct segmentation experiments with it. The F-Score of our approach on the whole evaluation data set is 76.84%. Compared with traditional collocation-based methods, ours makes the segmentation more accurate.",{"paper_id":4314,"title":4315,"year":81,"month":855,"day":63,"doi":4316,"resource_url":4317,"first_page":63,"last_page":63,"pdf_url":4318,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4319,"paper_type":860,"authors":4320,"abstract":4325},"lrec2018-main-187","ASAP++: Enriching the ASAP Automated Essay Grading Dataset with Essay Attribute Scores","10.63317\u002F4pmfncubhk6c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-187","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F373.pdf","mathias-bhattacharyya-2018-asap",[4321,4324],{"paper_id":4314,"author_seq":247,"given_name":4322,"surname":4323,"affiliation":63,"orcid":63},"Sandeep","Mathias",{"paper_id":4314,"author_seq":232,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},"In this paper, we describe the creation of a resource - ASAP++ - which is basically annotations of the Automatic Student Assessment Prize’s Automatic Essay Grading dataset. These annotations are scores for different attributes of the essays, such as content, word choice, organization, sentence fluency, etc. Each of these essays is scored by an annotator. We also report the results of each of the attributes using a Random Forest Classifier using a baseline set of task independent features. 
We release and share this resource to facilitate further research into these attributes of essay grading.",{"paper_id":4327,"title":4328,"year":81,"month":855,"day":63,"doi":4329,"resource_url":4330,"first_page":63,"last_page":63,"pdf_url":4331,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4332,"paper_type":860,"authors":4333,"abstract":4346},"lrec2018-main-188","MirasText: An Automatically Generated Text Corpus for Persian","10.63317\u002F4mhrs6r7ch29","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-188","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F385.pdf","sabeti-etal-2018-mirastext",[4334,4336,4339,4341,4344],{"paper_id":4327,"author_seq":247,"given_name":4189,"surname":4335,"affiliation":63,"orcid":63},"Sabeti",{"paper_id":4327,"author_seq":232,"given_name":4337,"surname":4338,"affiliation":63,"orcid":63},"Hossein","Abedi Firouzjaee",{"paper_id":4327,"author_seq":218,"given_name":2863,"surname":4340,"affiliation":63,"orcid":63},"Janalizadeh Choobbasti",{"paper_id":4327,"author_seq":203,"given_name":4342,"surname":4343,"affiliation":63,"orcid":63},"S.H.E.","Mortazavi Najafabadi",{"paper_id":4327,"author_seq":188,"given_name":1797,"surname":4345,"affiliation":63,"orcid":63},"Vaheb","Natural Language Processing is one of the most important fields of artificial intelligence. The rapid growth of digital content has made this field both practical and challenging at the same time. As opposed to less-resourced languages like Persian, there are several text corpora in dominant languages like English which can be used for NLP applications. \\\\In this paper, MirasText which is an automatically generated text corpus for Persian language is presented. In this study, over 250 Persian websites were crawled and several fields like content, description, keywords, title, etc have been extracted to generate MirasText. Topic modeling and language modeling are used to validate the generated corpus. 
MirasText has over 2.8 million documents and over 1.4 billion tokens, which to our knowledge is the largest Persian corpus currently available.",{"paper_id":4348,"title":4349,"year":81,"month":855,"day":63,"doi":4350,"resource_url":4351,"first_page":63,"last_page":63,"pdf_url":4352,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4353,"paper_type":860,"authors":4354,"abstract":4362},"lrec2018-main-189","The Reference Corpus of the Contemporary Romanian Language (CoRoLa)","10.63317\u002F3smf35cw29fa","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-189","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F423.pdf","barbu-mititelu-etal-2018-reference",[4355,4358,4360],{"paper_id":4348,"author_seq":247,"given_name":4356,"surname":4357,"affiliation":63,"orcid":63},"Verginica","Barbu Mititelu",{"paper_id":4348,"author_seq":232,"given_name":2383,"surname":4359,"affiliation":63,"orcid":63},"Tufiș",{"paper_id":4348,"author_seq":218,"given_name":3684,"surname":4361,"affiliation":63,"orcid":63},"Irimia","We present here the largest publicly available corpus of Romanian. Its written component contains 1,257,752,812 tokens, distributed, in an unbalanced way, in several language styles (legal, administrative, scientific, journalistic, imaginative, memoirs, blogposts), in four domains (arts and culture, nature, society, science) and in 71 subdomains. The oral component consists of almost 152 hours of recordings, with associated transcribed texts. All files have CMDI metadata associated. The written texts are automatically sentence-split, tokenized, part-of-speech tagged, lemmatized; a part of them are also syntactically annotated. The oral files are aligned with their corresponding transcriptions at word-phoneme level. The transcriptions are also automatically part-of-speech tagged, lemmatised and syllabified. 
CoRoLa contains original, IPR-cleared texts and is representative for the contemporary phase of the language, covering mostly the last 20 years. Its written component can be queried using the KorAP corpus management platform, whereas the oral component can be queried via its written counterpart, followed by the possibility of listening to the results of the query, using an in-house tool.",{"paper_id":4364,"title":4365,"year":81,"month":855,"day":63,"doi":4366,"resource_url":4367,"first_page":63,"last_page":63,"pdf_url":4368,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4369,"paper_type":860,"authors":4370,"abstract":4380},"lrec2018-main-190","A Corpus of Drug Usage Guidelines Annotated with Type of Advice","10.63317\u002F4uptpgmvhtj4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-190","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F426.pdf","preum-etal-2018-corpus",[4371,4374,4377,4378],{"paper_id":4364,"author_seq":247,"given_name":4372,"surname":4373,"affiliation":63,"orcid":63},"Sarah Masud","Preum",{"paper_id":4364,"author_seq":232,"given_name":4375,"surname":4376,"affiliation":63,"orcid":63},"Md. Rizwan","Parvez",{"paper_id":4364,"author_seq":218,"given_name":2068,"surname":2069,"affiliation":63,"orcid":63},{"paper_id":4364,"author_seq":203,"given_name":1734,"surname":4379,"affiliation":63,"orcid":63},"Stankovic","Adherence to drug usage guidelines for prescription and over-the-counter drugs is critical for drug safety and effectiveness of treatment. Drug usage guideline documents contain advice on potential drug-drug interaction, drug-food interaction, and drug administration process. Current research on drug safety and public health indicates patients are often either unaware of such critical advice or overlook them. Categorizing advice statements from these documents according to their topics can enable the patients to find safety critical information. 
However, automatically categorizing drug usage guidelines based on their topic is an open challenge and there is no annotated dataset on drug usage guidelines. To address the latter issue, this paper presents (i) an annotation scheme for annotating safety critical advice from drug usage guidelines, (ii) an annotation tool for such data, and (iii) an annotated dataset containing drug usage guidelines from 90 drugs. This work is expected to accelerate further release of annotated drug usage guideline datasets and research on automatically filtering safety critical information from these textual documents.",{"paper_id":4382,"title":4383,"year":81,"month":855,"day":63,"doi":4384,"resource_url":4385,"first_page":63,"last_page":63,"pdf_url":4386,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4387,"paper_type":860,"authors":4388,"abstract":4393},"lrec2018-main-191","BioRo: The Biomedical Corpus for the Romanian Language","10.63317\u002F4ausa5wkobgd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-191","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F424.pdf","mitrofan-tufis-2018-bioro",[4389,4391],{"paper_id":4382,"author_seq":247,"given_name":2301,"surname":4390,"affiliation":63,"orcid":63},"Mitrofan",{"paper_id":4382,"author_seq":232,"given_name":2383,"surname":4392,"affiliation":63,"orcid":63},"Tufiş","The biomedical domain provides a large amount of linguistic resources usable for biomedical text mining. While most of the resources used in biomedical Natural Language Processing are available for English, for other languages including Romanian the access to language resources is not straight-forward. In this paper, we present the biomedical corpus of the Romanian language, which is a valuable linguistic asset for biomedical text mining. This corpus was collected in the contexts of CoRoLa project, the reference corpus for the contemporary Romanian language. 
We also provide informative statistics about the corpus, a description of the data-composition. The annotation process of the corpus is also presented. Furthermore, we present the fraction of the corpus which will be made publicly available to the community without copyright restrictions.",{"paper_id":4395,"title":4396,"year":81,"month":855,"day":63,"doi":4397,"resource_url":4398,"first_page":63,"last_page":63,"pdf_url":4399,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4400,"paper_type":860,"authors":4401,"abstract":4410},"lrec2018-main-192","A Comparison Of Emotion Annotation Schemes And A New Annotated Data Set","10.63317\u002F3v7m3awcxn4a","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-192","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F61.pdf","wood-etal-2018-comparison",[4402,4405,4406,4409],{"paper_id":4395,"author_seq":247,"given_name":4403,"surname":4404,"affiliation":63,"orcid":63},"Ian D.","Wood",{"paper_id":4395,"author_seq":232,"given_name":3687,"surname":3688,"affiliation":63,"orcid":63},{"paper_id":4395,"author_seq":218,"given_name":4407,"surname":4408,"affiliation":63,"orcid":63},"Vladimir","Andryushechkin",{"paper_id":4395,"author_seq":203,"given_name":3690,"surname":3691,"affiliation":63,"orcid":63},"While the recognition of positive\u002Fnegative sentiment in text is an established task with many standard data sets and well developed methodologies, the recognition of more nuanced affect has received less attention, and in particular, there are very few publicly available gold standard annotated resources. To address this lack, we present a series of emotion annotation studies on tweets culminating in a publicly available collection of 2,019 tweets with scores on four emotion dimensions: valence, arousal, dominance and surprise, following the emotion representation model identified by Fontaine et.al. (Fontaine et al., 2007). Further, we make a comparison of relative vs. 
absolute annotation schemes. We find improved annotator agreement with a relative annotation scheme (comparisons) on a dimensional emotion model over a categorical annotation scheme on Ekman’s six basic emotions (Ekman et al., 1987), however when we compare inter-annotator agreement for comparisons with agreement for a rating scale annotation scheme (both with the same dimensional emotion model), we find improved inter-annotator agreement with rating scales, challenging a common belief that relative judgements are more reliable.",{"paper_id":4412,"title":4413,"year":81,"month":855,"day":63,"doi":4414,"resource_url":4415,"first_page":63,"last_page":63,"pdf_url":4416,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4417,"paper_type":860,"authors":4418,"abstract":4431},"lrec2018-main-193","Humor Detection in English-Hindi Code-Mixed Social Media Content : Corpus and Baseline System","10.63317\u002F37xx573x4anc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-193","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F363.pdf","khandelwal-etal-2018-humor",[4419,4422,4425,4428],{"paper_id":4412,"author_seq":247,"given_name":4420,"surname":4421,"affiliation":63,"orcid":63},"Ankush","Khandelwal",{"paper_id":4412,"author_seq":232,"given_name":4423,"surname":4424,"affiliation":63,"orcid":63},"Sahil","Swami",{"paper_id":4412,"author_seq":218,"given_name":4426,"surname":4427,"affiliation":63,"orcid":63},"Syed S.","Akhtar",{"paper_id":4412,"author_seq":203,"given_name":4429,"surname":4430,"affiliation":63,"orcid":63},"Manish","Shrivastava","The tremendous amount of user generated data through social networking sites led to the gaining popularity of automatic text classification in the field of computational linguistics over the past decade. Within this domain, one problem that has drawn the attention of many researchers is automatic humor detection in texts. 
In depth semantic understanding of the text is required to detect humor which makes the problem difficult to automate. With increase in the number of social media users, many multilingual speakers often interchange between languages while posting on social media which is called code-mixing. It introduces some challenges in the field of linguistic analysis of social media content (Barman et al., 2014), like spelling variations and non-grammatical structures in a sentence. Past researches include detecting puns in texts (Kao et al., 2016) and humor in one-lines (Mihalcea et al., 2010) in a single language, but with the tremendous amount of code-mixed data available online, there is a need to develop techniques which detects humor in code-mixed tweets. In this paper, we analyze the task of humor detection in texts and describe a freely available corpus containing English-Hindi code-mixed tweets annotated with humorous(H) or non-humorous(N) tags. We also tagged the words in the tweets with Language tags (English\u002FHindi\u002FOthers). 
Moreover, we provide a baseline classification system which distinguishes between humorous and non-humorous texts.",{"paper_id":4433,"title":4434,"year":81,"month":855,"day":63,"doi":4435,"resource_url":4436,"first_page":63,"last_page":63,"pdf_url":4437,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4438,"paper_type":860,"authors":4439,"abstract":4456},"lrec2018-main-194","Dialogue Scenario Collection of Persuasive Dialogue with Emotional Expressions via Crowdsourcing","10.63317\u002F4wt42u7kr9p2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-194","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F462.pdf","yoshino-etal-2018-dialogue",[4440,4443,4446,4449,4450,4453],{"paper_id":4433,"author_seq":247,"given_name":4441,"surname":4442,"affiliation":63,"orcid":63},"Koichiro","Yoshino",{"paper_id":4433,"author_seq":232,"given_name":4444,"surname":4445,"affiliation":63,"orcid":63},"Yoko","Ishikawa",{"paper_id":4433,"author_seq":218,"given_name":4447,"surname":4448,"affiliation":63,"orcid":63},"Masahiro","Mizukami",{"paper_id":4433,"author_seq":203,"given_name":2406,"surname":3907,"affiliation":63,"orcid":63},{"paper_id":4433,"author_seq":188,"given_name":4451,"surname":4452,"affiliation":63,"orcid":63},"Sakriani","Sakti",{"paper_id":4433,"author_seq":172,"given_name":4454,"surname":4455,"affiliation":63,"orcid":63},"Satoshi","Nakamura","Existing dialogue data collection methods such as the Wizard of Oz method (WoZ) or real dialogue recording are costly, and they prevent launching a new dialogue system. In this study, we requested crowd workers in crowdsourcing to create dialogue scenarios according to the instruction of the situation for persuasive dialogue systems that use emotional expressions. We collected 200 dialogues in 5 scenarios for a total of 1,000 via crowdsourcing. We also annotated emotional states and users' acceptance for system persuasion by using crowdsourcing. 
We constructed a persuasive dialogue system with the collected data and evaluated the system by interacting with crowd works. From the experiment, it was investigated that the collected labels have sufficient agreement even if we did not impose any training of annotation to workers.",{"paper_id":4458,"title":4459,"year":81,"month":855,"day":63,"doi":4460,"resource_url":4461,"first_page":63,"last_page":63,"pdf_url":4462,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4463,"paper_type":860,"authors":4464,"abstract":4468},"lrec2018-main-195","SentiArabic: A Sentiment Analyzer for Standard Arabic","10.63317\u002F3matbrdtj2yp","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-195","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F883.pdf","eskander-2018-sentiarabic",[4465],{"paper_id":4458,"author_seq":247,"given_name":4466,"surname":4467,"affiliation":63,"orcid":63},"Ramy","Eskander","Sentiment analysis has been receiving increasing interest as it conveys valuable information in regard to people’s preferences and opinions. In this work, we present a sentiment analyzer that identifies the overall contextual polarity for Standard Arabic text. The contribution of this work is threefold. First, we modify and extend SLSA; a large-scale Sentiment Lexicon for Standard Arabic. Second, we build a sentiment corpus of Standard Arabic text tagged for its contextual polarity. This corpus represents the training, development and test sets for the proposed system. Third, we build a lightweight lexicon-based sentiment analyzer for Standard Arabic (SentiArabic). 
The analyzer does not require running heavy computations, where the link to the lexicon is carried out through a morphological lookup as opposed to conducting a rich morphological analysis, while the assignment of the sentiment is based on a simple decision tree that uses polarity scores as opposed to a more complex machine learning approach that relies on lexical information, while negation receives special handling. The analyzer is highly efficient as it achieves an F-score of 76.5% when evaluated on a blind test set, which is the highest results reported for that set, and an absolute 3.0% increase over a state-of-the-art system that uses deep-learning models.",{"paper_id":4470,"title":4471,"year":81,"month":855,"day":63,"doi":4472,"resource_url":4473,"first_page":63,"last_page":63,"pdf_url":4474,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4475,"paper_type":860,"authors":4476,"abstract":4486},"lrec2018-main-196","Contextual Dependencies in Time-Continuous Multidimensional Affect Recognition","10.63317\u002F4uhctivpsh2y","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-196","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F923.pdf","fedotov-etal-2018-contextual",[4477,4480,4482,4485],{"paper_id":4470,"author_seq":247,"given_name":4478,"surname":4479,"affiliation":63,"orcid":63},"Dmitrii","Fedotov",{"paper_id":4470,"author_seq":232,"given_name":3946,"surname":4481,"affiliation":63,"orcid":63},"Ivanko",{"paper_id":4470,"author_seq":218,"given_name":4483,"surname":4484,"affiliation":63,"orcid":63},"Maxim","Sidorov",{"paper_id":4470,"author_seq":203,"given_name":1251,"surname":1252,"affiliation":63,"orcid":63},"Modern research on emotion recognition often deals with time-continuously labelled spontaneous interactions. Such data is much closer to real world problems in contrast to utterance-level categorical labelling in acted emotion corpora that have widely been used to date. 
While working with time-continuous labelling, one usually uses context-aware models, such as recurrent neural networks. The amount of context needed to show the best performance should be defined in this case. Despite of the research done in this field there is still no agreement on this issue. In this paper we model different amounts of contextual input data by varying two parameters: sparsing coefficient and time window size. A series of experiments conducted with different modalities and emotional labels on the RECOLA corpora has shown a strong pattern between the amount of context used in model and performance. The pattern remains the same for different pairs of modalities and label dimensions, but the intensity differs. Knowledge about an appropriate context can significantly reduce the complexity of the model and increase its flexibility.",{"paper_id":4488,"title":4489,"year":81,"month":855,"day":63,"doi":4490,"resource_url":4491,"first_page":63,"last_page":63,"pdf_url":4492,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4493,"paper_type":860,"authors":4494,"abstract":4497},"lrec2018-main-197","WikiArt Emotions: An Annotated Dataset of Emotions Evoked by Art","10.63317\u002F33oybkwsfa8r","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-197","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F966.pdf","mohammad-kiritchenko-2018-wikiart",[4495,4496],{"paper_id":4488,"author_seq":247,"given_name":1436,"surname":1437,"affiliation":63,"orcid":63},{"paper_id":4488,"author_seq":232,"given_name":1490,"surname":1491,"affiliation":63,"orcid":63},"Art is imaginative human creation meant to be appreciated, make people think, and evoke an emotional response. Here for the first time, we create a dataset of more than 4,000 pieces of art (mostly paintings) that has annotations for emotions evoked in the observer. 
The pieces of art are selected from WikiArt.org's collection for four western styles (Renaissance Art, Post-Renaissance Art, Modern Art, and Contemporary Art). The art is annotated via crowdsourcing for one or more of twenty emotion categories (including neutral). In addition to emotions, the art is also annotated for whether it includes the depiction of a face and how much the observers like the art. The dataset, which we refer to as the {\\it WikiArt Emotions Dataset}, can help answer several compelling questions, such as:  what makes art evocative, how does art convey different emotions, what attributes of a painting make it well liked, what combinations of categories and emotions evoke strong emotional response, how much does the title of an art impact its emotional response, and what is the extent to which different categories of art evoke consistent emotions in people. We found that fear, happiness, love, and sadness were the dominant emotions that also obtained consistent annotations among the different annotators. We found that the title often impacts the affectual response to art. We show that pieces of art that depict faces draw more consistent emotional responses than those that do not. We also show, for each art category and emotion combination, the average agreements on the emotions evoked and the average art ratings. 
The WikiArt Emotions dataset also has applications in automatic image processing, as it can be used to develop systems that detect emotions evoked by art, and systems that can transform existing art (or even generate new art) that evokes the desired affectual response.",{"paper_id":4499,"title":4500,"year":81,"month":855,"day":63,"doi":4501,"resource_url":4502,"first_page":63,"last_page":63,"pdf_url":4503,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4504,"paper_type":860,"authors":4505,"abstract":4520},"lrec2018-main-198","Arabic Data Science Toolkit: An API for Arabic Language Feature Extraction","10.63317\u002F2qcsskyrb6m5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-198","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F998.pdf","rodrigues-etal-2018-arabic",[4506,4508,4511,4514,4517],{"paper_id":4499,"author_seq":247,"given_name":3690,"surname":4507,"affiliation":63,"orcid":63},"Rodrigues",{"paper_id":4499,"author_seq":232,"given_name":4509,"surname":4510,"affiliation":63,"orcid":63},"Valerie","Novak",{"paper_id":4499,"author_seq":218,"given_name":4512,"surname":4513,"affiliation":63,"orcid":63},"C. Anton","Rytting",{"paper_id":4499,"author_seq":203,"given_name":4515,"surname":4516,"affiliation":63,"orcid":63},"Julie","Yelle",{"paper_id":4499,"author_seq":188,"given_name":4518,"surname":4519,"affiliation":63,"orcid":63},"Jennifer","Boutz","We introduce Arabic Data Science Toolkit (ADST), a framework for Arabic language feature extraction, designed for data scientists that may not be familiar with Arabic or natural language processing. 
The functions in the toolkit allow data scientists to extend their algorithms beyond lexical or statistical methods and leverage Arabic-specific linguistic and stylistic features to enhance their systems and enable them to reach performance levels they might receive on languages with more resources, or languages with which they have more familiarity.",{"paper_id":4522,"title":4523,"year":81,"month":855,"day":63,"doi":4524,"resource_url":4525,"first_page":63,"last_page":63,"pdf_url":4526,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4527,"paper_type":860,"authors":4528,"abstract":4533},"lrec2018-main-199","Sentence and Clause Level Emotion Annotation, Detection, and Classification in a Multi-Genre Corpus","10.63317\u002F3mnuyc2fn9ah","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-199","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1065.pdf","tafreshi-diab-2018-sentence",[4529,4532],{"paper_id":4522,"author_seq":247,"given_name":4530,"surname":4531,"affiliation":63,"orcid":63},"Shabnam","Tafreshi",{"paper_id":4522,"author_seq":232,"given_name":4093,"surname":4094,"affiliation":63,"orcid":63},"Predicting emotion categories (e.g. anger, joy, sadness) expressed by a sentence is challenging due to inherent multi-label smaller pieces such as phrases and clauses. To date, emotion has been studied in single genre, while models of human behaviors or situational awareness in the event of disasters require emotion modeling in multi-genres. In this paper, we expand and unify existing annotated data in different genres (emotional blog post, news title, and movie reviews) using an inventory of 8 emotions from Plutchik's Wheel of Emotions tags. We develop systems for automatically detecting and classifying emotions in text, in different textual genres and granularity levels, namely, sentence and clause levels in a supervised setting. 
We explore the effectiveness of clause annotation in sentence-level emotion detection and classification (EDC). To our knowledge, our EDC system is the first to target the clause level; further we provide emotion annotation for movie reviews dataset for the first time.",{"paper_id":4535,"title":4536,"year":81,"month":855,"day":63,"doi":4537,"resource_url":4538,"first_page":63,"last_page":63,"pdf_url":4539,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4540,"paper_type":860,"authors":4541,"abstract":4552},"lrec2018-main-200","A Swedish Cookie-Theft Corpus","10.63317\u002F2e42uqpk9g23","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-200","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F307.pdf","kokkinakis-etal-2018-swedish",[4542,4545,4547,4549],{"paper_id":4535,"author_seq":247,"given_name":4543,"surname":4544,"affiliation":63,"orcid":63},"Dimitrios","Kokkinakis",{"paper_id":4535,"author_seq":232,"given_name":3379,"surname":4546,"affiliation":63,"orcid":63},"Lundholm Fors",{"paper_id":4535,"author_seq":218,"given_name":3867,"surname":4548,"affiliation":63,"orcid":63},"Fraser",{"paper_id":4535,"author_seq":203,"given_name":4550,"surname":4551,"affiliation":63,"orcid":63},"Arto","Nordlund","Language disturbances can be a diagnostic marker for neurodegenerative diseases, such as Alzheimer’s disease, at earlier stages. Connected speech analysis provides a non-invasive and easy-to-assess measure for determining aspects of the severity of language impairment. In this paper we focus on the development of a new corpus consisting of audio recordings of picture descriptions (including transcriptions) of the Cookie-theft, produced by Swedish speakers. The speech elicitation procedure provides an established method of obtaining highly constrained samples of connected speech that can allow us to study the intricate interactions between various linguistic levels and cognition. 
We chose the Cookie-theft picture since it’s a standardized test that has been used in various studies in the past, and therefore comparisons can be made based on previous research. This type of picture description task might be useful for detecting subtle language deficits in patients with subjective and mild cognitive impairment. The resulting corpus is a new, rich and multi-faceted resource for the investigation of linguistic characteristics of connected speech and a unique dataset that provides a rich resource for (future) research and experimentation in many areas, and of language impairment in particular. The information in the corpus can also be combined and correlated with other collected data about the speakers, such as neuropsychological tests, brain physiology and cerebrospinal fluid markers as well as imaging.",{"paper_id":4554,"title":4555,"year":81,"month":855,"day":63,"doi":4556,"resource_url":4557,"first_page":63,"last_page":63,"pdf_url":4558,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4559,"paper_type":860,"authors":4560,"abstract":4566},"lrec2018-main-201","Sharing Copies of Synthetic Clinical Corpora without Physical Distribution — A Case Study to Get Around IPRs and Privacy Constraints Featuring the German JSYNCC Corpus","10.63317\u002F3oo2qvfendx8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-201","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F701.pdf","lohr-etal-2018-sharing",[4561,4564,4565],{"paper_id":4554,"author_seq":247,"given_name":4562,"surname":4563,"affiliation":63,"orcid":63},"Christina","Lohr",{"paper_id":4554,"author_seq":232,"given_name":1448,"surname":1449,"affiliation":63,"orcid":63},{"paper_id":4554,"author_seq":218,"given_name":1451,"surname":1452,"affiliation":63,"orcid":63},"The legal culture in the European Union imposes almost unsurmountable hurdles to exploit copyright protected language data (in terms of intellectual property rights (IPRs) of 
media contents) and privacy protected medical health data (in terms of the notion of informational self-determination) as language resources for the NLP community.  These juridical constraints have seriously hampered progress in resource-greedy NLP research, in particular for non-English languages in the clinical domain. In order to get around these restrictions, we introduce a novel approach for the creation and re-use of clinical corpora which is based on a two-step workflow. First, we substitute authentic clinical documents by synthetic ones, i.e., made-up reports and case studies written by medical professionals for educational purposes and published in medical e-textbooks. We thus eliminate patients' privacy concerns since no real, concrete individuals are addressed in such narratives.  In a second step, we replace physical corpus distribution by sharing software for trustful re-construction of corpus copies. This is achieved by an end-to-end tool suite which extracts well-specified text fragments from e-books and assembles, on demand, identical copies of the same text corpus we defined at our lab at any other site where this software is executed. Thus, we avoid IPR violations since no physical corpus (raw text data) is distributed. 
As an illustrative case study which is easily portable to other languages we present JSYNCC, the largest and, even more importantly, first publicly available, corpus of German clinical language.",{"paper_id":4568,"title":4569,"year":81,"month":855,"day":63,"doi":4570,"resource_url":4571,"first_page":63,"last_page":63,"pdf_url":4572,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4573,"paper_type":860,"authors":4574,"abstract":4587},"lrec2018-main-202","A Legal Perspective on Training Models for Natural Language Processing","10.63317\u002F4fatnhrwbrsb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-202","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1006.pdf","eckart-de-castilho-etal-2018-legal",[4575,4577,4579,4581,4584],{"paper_id":4568,"author_seq":247,"given_name":1070,"surname":4576,"affiliation":63,"orcid":63},"Eckart de Castilho",{"paper_id":4568,"author_seq":232,"given_name":2996,"surname":4578,"affiliation":63,"orcid":63},"Dore",{"paper_id":4568,"author_seq":218,"given_name":1615,"surname":4580,"affiliation":63,"orcid":63},"Margoni",{"paper_id":4568,"author_seq":203,"given_name":4582,"surname":4583,"affiliation":63,"orcid":63},"Penny","Labropoulou",{"paper_id":4568,"author_seq":188,"given_name":4585,"surname":4586,"affiliation":63,"orcid":63},"Iryna","Gurevych","A significant concern in processing natural language data is the often unclear legal status of the input and output data\u002Fresources. In this paper, we investigate this problem by discussing a typical activity in Natural Language Processing: the training of a machine learning model from an annotated corpus. 
We examine which legal rules apply at relevant steps and how they affect the legal status of the results, especially in terms of copyright and copyright-related rights.",{"paper_id":4589,"title":4590,"year":81,"month":855,"day":63,"doi":4591,"resource_url":4592,"first_page":63,"last_page":63,"pdf_url":4593,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4594,"paper_type":860,"authors":4595,"abstract":4604},"lrec2018-main-203","LREMap, a Song of Resources and Evaluation","10.63317\u002F56rjafkpwavx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-203","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F300.pdf","del-gratta-etal-2018-lremap",[4596,4599,4600,4601],{"paper_id":4589,"author_seq":247,"given_name":4597,"surname":4598,"affiliation":63,"orcid":63},"Riccardo","Del Gratta",{"paper_id":4589,"author_seq":232,"given_name":2650,"surname":2651,"affiliation":63,"orcid":63},{"paper_id":4589,"author_seq":218,"given_name":2656,"surname":2657,"affiliation":63,"orcid":63},{"paper_id":4589,"author_seq":203,"given_name":4602,"surname":4603,"affiliation":63,"orcid":63},"Nicoletta","Calzolari","After 8 years we revisit the LRE Map of Language Resources, introduced at LREC 2010, to try to get a picture of the field and its evolution as reflected by the creation and use of Language Resources. The purpose of the Map was in fact “to shed light on the vast amount of resources that represent the background of the research presented at LREC”. It also aimed at a “change of culture in the field, actively engaging each researcher in the documentation task about resources”. The data analysed here have been provided by the authors of several conferences during the phase of submission of papers, and contain information about ca. 7500 resources. 
We analysed the LRE Map data from many different viewpoints and the paper reports on the global picture, on different trends emerging from the diachronic perspective and finally on some comparisons between the 2 major conferences present in the Map: LREC and COLING.",{"paper_id":4606,"title":4607,"year":81,"month":855,"day":63,"doi":4608,"resource_url":4609,"first_page":63,"last_page":63,"pdf_url":4610,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4611,"paper_type":860,"authors":4612,"abstract":4622},"lrec2018-main-204","Metadata Collection Records for Language Resources","10.63317\u002F4epu9r5m242r","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-204","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F336.pdf","van-den-heuvel-etal-2018-metadata",[4613,4616,4619],{"paper_id":4606,"author_seq":247,"given_name":4614,"surname":4615,"affiliation":63,"orcid":63},"Henk","van den Heuvel",{"paper_id":4606,"author_seq":232,"given_name":4617,"surname":4618,"affiliation":63,"orcid":63},"Erwin","Komen",{"paper_id":4606,"author_seq":218,"given_name":4620,"surname":4621,"affiliation":63,"orcid":63},"Nelleke","Oostdijk","In this paper we motivate the need for introducing more elaborate and consistent metadata records for collections of (linguistic) data resources in the CLARIN context. For this purpose we designed and implemented a CMDI profile. We validated the profile in a first pilot in which we populated the profile for 45 Dutch language resources. Given the complexity of the profile and special purpose requirements we developed our own interface for creating, editing, listing, copying and exporting descriptions of metadata collection records. 
The requirements for this interface and its implementation are described.",{"paper_id":4624,"title":4625,"year":81,"month":855,"day":63,"doi":4626,"resource_url":4627,"first_page":63,"last_page":63,"pdf_url":4628,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4629,"paper_type":860,"authors":4630,"abstract":4640},"lrec2018-main-205","Managing Public Sector Data for Multilingual Applications Development","10.63317\u002F4prjgjzqf657","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-205","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F648.pdf","piperidis-etal-2018-managing",[4631,4634,4635,4638],{"paper_id":4624,"author_seq":247,"given_name":4632,"surname":4633,"affiliation":63,"orcid":63},"Stelios","Piperidis",{"paper_id":4624,"author_seq":232,"given_name":4582,"surname":4583,"affiliation":63,"orcid":63},{"paper_id":4624,"author_seq":218,"given_name":4636,"surname":4637,"affiliation":63,"orcid":63},"Miltos","Deligiannis",{"paper_id":4624,"author_seq":203,"given_name":2301,"surname":4639,"affiliation":63,"orcid":63},"Giagkou","The current paper outlines the ELRC-SHARE repository, an infrastructure designed and developed in the framework of the European Language Resource Coordination action with the objective to host, document, manage and appropriately distribute language resources pertinent to machine translation, and specifically tailored to the needs of the eTranslation service of the European Commission. Due to the scope of the eTranslation service which seeks to facilitate multilingual communication across public administrations in 30 European countries and to enable Europe-wide multilingual digital services, ELRC-SHARE demonstrates a number of characteristics in terms of its technical and functional parameters, as well as in terms of its data management and documentation layers. 
The paper elaborates on the repository technical characteristics, the underlying metadata schema, the different ways in which data and metadata can be provided, the user roles and their respective permissions on data management, and, finally, the extensions currently being implemented.",{"paper_id":4642,"title":4643,"year":81,"month":855,"day":63,"doi":4644,"resource_url":4645,"first_page":63,"last_page":63,"pdf_url":4646,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4647,"paper_type":860,"authors":4648,"abstract":4675},"lrec2018-main-206","Bridging the LAPPS Grid and CLARIN","10.63317\u002F59ndrhjyf75s","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-206","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F662.pdf","hinrichs-etal-2018-bridging",[4649,4652,4653,4654,4655,4657,4660,4663,4666,4669,4672],{"paper_id":4642,"author_seq":247,"given_name":4650,"surname":4651,"affiliation":63,"orcid":63},"Erhard","Hinrichs",{"paper_id":4642,"author_seq":232,"given_name":1400,"surname":1401,"affiliation":63,"orcid":63},{"paper_id":4642,"author_seq":218,"given_name":1016,"surname":1017,"affiliation":63,"orcid":63},{"paper_id":4642,"author_seq":203,"given_name":2633,"surname":3462,"affiliation":63,"orcid":63},{"paper_id":4642,"author_seq":188,"given_name":4656,"surname":4651,"affiliation":63,"orcid":63},"Marie",{"paper_id":4642,"author_seq":172,"given_name":4658,"surname":4659,"affiliation":63,"orcid":63},"Mohammad 
Fazleh","Elahi",{"paper_id":4642,"author_seq":155,"given_name":4661,"surname":4662,"affiliation":63,"orcid":63},"Keith","Suderman",{"paper_id":4642,"author_seq":138,"given_name":4664,"surname":4665,"affiliation":63,"orcid":63},"Marc","Verhagen",{"paper_id":4642,"author_seq":121,"given_name":4667,"surname":4668,"affiliation":63,"orcid":63},"Kyeongmin","Rim",{"paper_id":4642,"author_seq":104,"given_name":4670,"surname":4671,"affiliation":63,"orcid":63},"Pavel","Straňák",{"paper_id":4642,"author_seq":87,"given_name":4673,"surname":4674,"affiliation":63,"orcid":63},"Jozef","Mišutka","The LAPPS-CLARIN project is creating a “trust network” between the Language Applications (LAPPS) Grid and the WebLicht workflow engine hosted by the CLARIN-D Center in Tubingen. The project also includes integration of NLP services available from the LINDAT\u002FCLARIN Center in Prague. The goal is to allow users on one side of the bridge to gain appropriately authenticated access to the other and enable seamless communication among tools and resources in both frameworks. 
The resulting “meta-framework” provides users across the globe with access to an unprecedented array of language processing facilities that cover multiple languages, tasks, and applications, all of which are fully interoperable.",{"paper_id":4677,"title":4678,"year":81,"month":855,"day":63,"doi":4679,"resource_url":4680,"first_page":63,"last_page":63,"pdf_url":4681,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4682,"paper_type":860,"authors":4683,"abstract":4695},"lrec2018-main-207","Fluid Annotation: A Granularity-aware Annotation Tool for Chinese Word Fluidity","10.63317\u002F33ojin3zjnuh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-207","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F716.pdf","hsieh-etal-2018-fluid",[4684,4687,4690,4692],{"paper_id":4677,"author_seq":247,"given_name":4685,"surname":4686,"affiliation":63,"orcid":63},"Shu-Kai","Hsieh",{"paper_id":4677,"author_seq":232,"given_name":4688,"surname":4689,"affiliation":63,"orcid":63},"Yu-Hsiang","Tseng",{"paper_id":4677,"author_seq":218,"given_name":4691,"surname":4032,"affiliation":63,"orcid":63},"Chih-Yao",{"paper_id":4677,"author_seq":203,"given_name":4693,"surname":4694,"affiliation":63,"orcid":63},"Chiung-Yu","Chiang","This paper presents a novel word granularity-aware annotation framework for Chinese. Anchored in current functionalist linguistics, this model rearranges the boundary of word segmentation and linguistic annotation, and gears toward a deeper understanding of lexical units and their behavior. 
The web-based annotation UI also supports flexible annotation tasks for various linguistic and affective phenomena.",{"paper_id":4697,"title":4698,"year":81,"month":855,"day":63,"doi":4699,"resource_url":4700,"first_page":63,"last_page":63,"pdf_url":4701,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4702,"paper_type":860,"authors":4703,"abstract":4719},"lrec2018-main-208","E-magyar – A Digital Language Processing System","10.63317\u002F2x92buwbygd5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-208","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F730.pdf","varadi-etal-2018-e",[4704,4707,4709,4712,4715,4716,4717,4718],{"paper_id":4697,"author_seq":247,"given_name":4705,"surname":4706,"affiliation":63,"orcid":63},"Tamás","Váradi",{"paper_id":4697,"author_seq":232,"given_name":4708,"surname":1269,"affiliation":63,"orcid":63},"Eszter",{"paper_id":4697,"author_seq":218,"given_name":4710,"surname":4711,"affiliation":63,"orcid":63},"Bálint","Sass",{"paper_id":4697,"author_seq":203,"given_name":4713,"surname":4714,"affiliation":63,"orcid":63},"Iván","Mittelholcz",{"paper_id":4697,"author_seq":188,"given_name":976,"surname":977,"affiliation":63,"orcid":63},{"paper_id":4697,"author_seq":172,"given_name":2696,"surname":2697,"affiliation":63,"orcid":63},{"paper_id":4697,"author_seq":155,"given_name":2053,"surname":2054,"affiliation":63,"orcid":63},{"paper_id":4697,"author_seq":138,"given_name":2044,"surname":2045,"affiliation":63,"orcid":63},"e-magyar is a new toolset for the analysis of Hungarian texts. It was produced as a collaborative effort of the Hungarian language technology community integrating the best state of the art tools, enhancing them where necessary, making them interoperable and releasing them with a clear license. It is a free, open, modular text processing pipeline which is integrated in the GATE system offering further prospects of interoperability. 
From tokenizing to parsing and named entity recognition, existing tools were examined and those selected for integration underwent various amount of overhaul in order to operate in the pipeline with a uniform encoding, and run in the same Java platform. The tokenizer was re-built from ground up and the flagship module, the morphological analyzer, based on the Humor system, was given a new annotation system and was implemented in the HFST framework. The system is aimed for a broad range of users, from language technology application developers to digital humanities researchers alike. It comes with a drag-and-drop demo on its website: http:\u002F\u002Fe-magyar.hu\u002Fen\u002F.",{"paper_id":4721,"title":4722,"year":81,"month":855,"day":63,"doi":4723,"resource_url":4724,"first_page":63,"last_page":63,"pdf_url":4725,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4726,"paper_type":860,"authors":4727,"abstract":4749},"lrec2018-main-209","ILCM - A Virtual Research Infrastructure for Large-Scale Qualitative 
Data","10.63317\u002F2meydjiquiv4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-209","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F734.pdf","niekler-etal-2018-ilcm",[4728,4730,4733,4735,4738,4741,4744,4747],{"paper_id":4721,"author_seq":247,"given_name":3238,"surname":4729,"affiliation":63,"orcid":63},"Niekler",{"paper_id":4721,"author_seq":232,"given_name":4731,"surname":4732,"affiliation":63,"orcid":63},"Arnim","Bleier",{"paper_id":4721,"author_seq":218,"given_name":904,"surname":4734,"affiliation":63,"orcid":63},"Kahmann",{"paper_id":4721,"author_seq":203,"given_name":4736,"surname":4737,"affiliation":63,"orcid":63},"Lisa","Posch",{"paper_id":4721,"author_seq":188,"given_name":4739,"surname":4740,"affiliation":63,"orcid":63},"Gregor","Wiedemann",{"paper_id":4721,"author_seq":172,"given_name":4742,"surname":4743,"affiliation":63,"orcid":63},"Kenan","Erdogan",{"paper_id":4721,"author_seq":155,"given_name":4745,"surname":4746,"affiliation":63,"orcid":63},"Gerhard","Heyer",{"paper_id":4721,"author_seq":138,"given_name":1771,"surname":4748,"affiliation":63,"orcid":63},"Strohmaier","The iLCM project pursues the development of an integrated research environment for the analysis of structured and unstructured data in a “Software as a Service” architecture (SaaS). The research environment addresses requirements for the quantitative evaluation of large amounts of qualitative data with text mining methods as well as requirements for the reproducibility of data-driven research designs in the social sciences.  For this, the iLCM research environment comprises two central components.  First, the Leipzig Corpus Miner (LCM), a decentralized SaaS application for the analysis of large amounts of news texts developed in a previous Digital Humanities project.        Second, the text mining tools implemented in the LCM are extended by an “Open Research Computing” (ORC) environment for executable script documents, so-called “notebooks”.  
This novel integration allows to combine generic, high-performance methods to  process  large  amounts  of  unstructured        text  data  and  with  individual  program  scripts  to  address  specific  research  requirements  in computational social science and digital humanities. ilcm.informatik.uni-leipzig.de",{"paper_id":4751,"title":4752,"year":81,"month":855,"day":63,"doi":4753,"resource_url":4754,"first_page":63,"last_page":63,"pdf_url":4755,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4756,"paper_type":860,"authors":4757,"abstract":4767},"lrec2018-main-210","CLARIN’s Key Resource Families","10.63317\u002F4cmws2zbo4nm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-210","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F829.pdf","fiser-etal-2018-clarins",[4758,4761,4764],{"paper_id":4751,"author_seq":247,"given_name":4759,"surname":4760,"affiliation":63,"orcid":63},"Darja","Fišer",{"paper_id":4751,"author_seq":232,"given_name":4762,"surname":4763,"affiliation":63,"orcid":63},"Jakob","Lenardič",{"paper_id":4751,"author_seq":218,"given_name":4765,"surname":4766,"affiliation":63,"orcid":63},"Tomaž","Erjavec","CLARIN is a European Research Infrastructure that has been established to support the accessibility of language resources and technologies to researchers from the Digital Humanities and Social Sciences. This paper presents CLARIN’s Key Resource Families, a new initiative within the infrastructure, the goal of which is to collect and present in a uniform way the most prominent data types in the network of CLARIN consortia that display a high degree of maturity, are available for most EU languages, are a rich source of social and cultural data, and as such are highly relevant for research from a wide range of disciplines and methodological approaches in the Digital Humanities and Social Sciences as well as for cross-disciplinary and trans-national comparative research. 
The four resource families that we present each in turn are newspaper, parliamentary, CMC (computer-mediated communication), and parallel corpora. We focus on their presentation within the infrastructure, their metadata in terms of size, temporal coverage, annotation, accessibility and license, and discuss current problems.",{"paper_id":4769,"title":4770,"year":81,"month":855,"day":63,"doi":4771,"resource_url":4772,"first_page":63,"last_page":63,"pdf_url":4773,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4774,"paper_type":860,"authors":4775,"abstract":4793},"lrec2018-main-211","Indra: A Word Embedding and Semantic Relatedness Server","10.63317\u002F3nxdt42iyznk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-211","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F914.pdf","sales-etal-2018-indra",[4776,4779,4781,4784,4787,4790],{"paper_id":4769,"author_seq":247,"given_name":4777,"surname":4778,"affiliation":63,"orcid":63},"Juliano Efson","Sales",{"paper_id":4769,"author_seq":232,"given_name":1956,"surname":4780,"affiliation":63,"orcid":63},"Souza",{"paper_id":4769,"author_seq":218,"given_name":4782,"surname":4783,"affiliation":63,"orcid":63},"Siamak","Barzegar",{"paper_id":4769,"author_seq":203,"given_name":4785,"surname":4786,"affiliation":63,"orcid":63},"Brian","Davis",{"paper_id":4769,"author_seq":188,"given_name":4788,"surname":4789,"affiliation":63,"orcid":63},"André","Freitas",{"paper_id":4769,"author_seq":172,"given_name":4791,"surname":4792,"affiliation":63,"orcid":63},"Siegfried","Handschuh","In recent years word embedding\u002Fdistributional semantic models evolved to become a fundamental component in many natural language processing (NLP) architectures due to their ability of capturing and quantifying semantic associations at scale. 
Word embedding models can be used to satisfy recurrent tasks in NLP such as lexical and semantic generalisation in machine learning tasks, finding similar or related words and computing semantic relatedness of terms. However, building and consuming specific word embedding models require the setting of a large set of configurations, such as corpus-dependant parameters, distance measures as well as compositional models. Despite their increasing relevance as a component in NLP architectures, existing frameworks provide limited options in their ability to systematically build, parametrise, compare and evaluate different models. To answer this demand, this paper describes INDRA , a multi-lingual word embedding\u002Fdistributional semantics framework which supports the creation, use and evaluation of word embedding models. In addition to the tool, INDRA also shares more than 65 pre-computed models in 14 languages.",{"paper_id":4795,"title":4796,"year":81,"month":855,"day":63,"doi":4797,"resource_url":4798,"first_page":63,"last_page":63,"pdf_url":4799,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4800,"paper_type":860,"authors":4801,"abstract":4805},"lrec2018-main-212","A UIMA Database Interface for Managing NLP-related Text Annotations","10.63317\u002F2dcoz4j3hers","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-212","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F938.pdf","abrami-mehler-2018-uima",[4802,4804],{"paper_id":4795,"author_seq":247,"given_name":1679,"surname":4803,"affiliation":63,"orcid":63},"Abrami",{"paper_id":4795,"author_seq":232,"given_name":2736,"surname":4012,"affiliation":63,"orcid":63},"NLP and automatic text analysis necessarily involve the annotation of natural language texts. The Apache Unstructured Information Management applications (UIMA) framework is used in several projects, tools and resources, and has become a de facto standard in this area. 
Despite the multiple use of UIMA as a document-based schema, it does not provide native database support. In order to facilitate distributed storage and enable UIMA-based projects to perform targeted queries, we have developed the UIMA Database Interface (UIMA DI). UIMA DI sets up an environment for a generic use of UIMA documents in database systems. In addition, the integration of UIMA DI into rights and resource management tools enables user and group-specific access to UIMA documents and provides data protection. Finally, UIMA documents can be made accessible for third party programs. UIMA DI, which we evaluate in relation to file system-based storage, is available under the GPLv3 license via GitHub.",{"paper_id":4807,"title":4808,"year":81,"month":855,"day":63,"doi":4809,"resource_url":4810,"first_page":63,"last_page":63,"pdf_url":4811,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4812,"paper_type":860,"authors":4813,"abstract":4829},"lrec2018-main-213","European Language Resource Coordination: Collecting Language Resources for Public Sector Multilingual Information 
Management","10.63317\u002F5cckwtxra6op","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-213","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1119.pdf","losch-etal-2018-european",[4814,4816,4817,4818,4821,4824,4825,4826,4827],{"paper_id":4807,"author_seq":247,"given_name":2516,"surname":4815,"affiliation":63,"orcid":63},"Lösch",{"paper_id":4807,"author_seq":232,"given_name":1317,"surname":1318,"affiliation":63,"orcid":63},{"paper_id":4807,"author_seq":218,"given_name":4632,"surname":4633,"affiliation":63,"orcid":63},{"paper_id":4807,"author_seq":203,"given_name":4819,"surname":4820,"affiliation":63,"orcid":63},"Andrejs","Vasiļjevs",{"paper_id":4807,"author_seq":188,"given_name":4822,"surname":4823,"affiliation":63,"orcid":63},"Lilli","Smal",{"paper_id":4807,"author_seq":172,"given_name":881,"surname":1555,"affiliation":63,"orcid":63},{"paper_id":4807,"author_seq":155,"given_name":2757,"surname":2758,"affiliation":63,"orcid":63},{"paper_id":4807,"author_seq":138,"given_name":1320,"surname":1321,"affiliation":63,"orcid":63},{"paper_id":4807,"author_seq":121,"given_name":2804,"surname":4828,"affiliation":63,"orcid":63},"van Genabith","In order to help improve the quality, coverage and performance of automated translation solutions in the context of current and future Connecting Europe Facility (CEF) digital services, the European Language Resource Coordination (ELRC) consortium was set up through a service contract operating under the European Commission’s CEF SMART 2014\u002F1074 programme to initiate a number of actions to support the collection of Language Resources (LRs) within the public sector. 
The first action consisted in raising awareness in the public sector through the organisation of dedicated events: 2 conferences and 29 country-specific workshops to meet with national or regional\u002Fmunicipal governmental organisations, language competence centres, relevant European institutions and other potential holders of LRs from the public service administrations. In order to gather resources shared by the contributors, the ELRC-SHARE Repository was built up together with services to support the sharing of LRs, such as the ELRC Helpdesk and Intellectual property Rights (IPR) clearance support. All collected LRs should pass a validation process whose guidelines were developed within the project. The collected LRs cover all official EU languages, plus Icelandic and Norwegian.",{"paper_id":4831,"title":4832,"year":81,"month":855,"day":63,"doi":4833,"resource_url":4834,"first_page":63,"last_page":63,"pdf_url":4835,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4836,"paper_type":860,"authors":4837,"abstract":4853},"lrec2018-main-214","Tilde MT Platform for Developing Client Specific MT 
Solutions","10.63317\u002F4h5bn6ysvwjv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-214","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8882.pdf","pinnis-etal-2018-tilde",[4838,4841,4842,4845,4847,4850],{"paper_id":4831,"author_seq":247,"given_name":4839,"surname":4840,"affiliation":63,"orcid":63},"Mārcis","Pinnis",{"paper_id":4831,"author_seq":232,"given_name":4819,"surname":4820,"affiliation":63,"orcid":63},{"paper_id":4831,"author_seq":218,"given_name":4843,"surname":4844,"affiliation":63,"orcid":63},"Rihards","Kalniņš",{"paper_id":4831,"author_seq":203,"given_name":1701,"surname":4846,"affiliation":63,"orcid":63},"Rozis",{"paper_id":4831,"author_seq":188,"given_name":4848,"surname":4849,"affiliation":63,"orcid":63},"Raivis","Skadiņš",{"paper_id":4831,"author_seq":172,"given_name":4851,"surname":4852,"affiliation":63,"orcid":63},"Valters","Šics","In this paper, we present Tilde MT, a custom machine translation platform that provides linguistic data storage (parallel, monolingual corpora, multilingual term collections), data cleaning and normalisation, statistical and neural machine translation system training and hosting functionality, as well as wide integration capabilities (a machine user API and popular computer-assisted translation tool plugins). 
We provide details for the most important features of the platform, as well as elaborate typical MT system training workflows for client-specific MT solution development.",{"paper_id":4855,"title":4856,"year":81,"month":855,"day":63,"doi":4857,"resource_url":4858,"first_page":63,"last_page":63,"pdf_url":4859,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4860,"paper_type":860,"authors":4861,"abstract":4871},"lrec2018-main-215","Improving homograph disambiguation with supervised machine learning","10.63317\u002F26f425yehm3f","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-215","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8880.pdf","gorman-etal-2018-improving",[4862,4865,4868],{"paper_id":4855,"author_seq":247,"given_name":4863,"surname":4864,"affiliation":63,"orcid":63},"Kyle","Gorman",{"paper_id":4855,"author_seq":232,"given_name":4866,"surname":4867,"affiliation":63,"orcid":63},"Gleb","Mazovetskiy",{"paper_id":4855,"author_seq":218,"given_name":4869,"surname":4870,"affiliation":63,"orcid":63},"Vitaly","Nikolaev","We describe a pre-existing rule-based homograph disambiguation system used for text-to-speech synthesis at Google, and compare it against a novel system which performs disambiguation using classifiers trained on a small amount of labeled data. An evaluation of these systems, using a new, freely available English data set, finds that hybrid systems (making use of both rules and machine learning) are significantly more accurate than either hand-written rules or machine learning alone. The evaluation also finds minimal performance degradation when the hybrid system is configured to run on limited-resource mobile devices rather than on production servers. 
The two best systems described here are used for homograph disambiguation on all American English text-to-speech traffic at Google.",{"paper_id":4873,"title":4874,"year":81,"month":855,"day":63,"doi":4875,"resource_url":4876,"first_page":63,"last_page":63,"pdf_url":4877,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4878,"paper_type":860,"authors":4879,"abstract":4898},"lrec2018-main-216","Text Normalization Infrastructure that Scales to Hundreds of Language Varieties","10.63317\u002F57m3aa8gyujx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-216","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8883.pdf","chua-etal-2018-text",[4880,4883,4886,4889,4892,4895],{"paper_id":4873,"author_seq":247,"given_name":4881,"surname":4882,"affiliation":63,"orcid":63},"Mason","Chua",{"paper_id":4873,"author_seq":232,"given_name":4884,"surname":4885,"affiliation":63,"orcid":63},"Daan","van Esch",{"paper_id":4873,"author_seq":218,"given_name":4887,"surname":4888,"affiliation":63,"orcid":63},"Noah","Coccaro",{"paper_id":4873,"author_seq":203,"given_name":4890,"surname":4891,"affiliation":63,"orcid":63},"Eunjoon","Cho",{"paper_id":4873,"author_seq":188,"given_name":4893,"surname":4894,"affiliation":63,"orcid":63},"Sujeet","Bhandari",{"paper_id":4873,"author_seq":172,"given_name":4896,"surname":4897,"affiliation":63,"orcid":63},"Libin","Jia","We describe the automated multi-language text normalization infrastructure that prepares textual data to train language models used in Google’s keyboards and speech recognition systems, across hundreds of language varieties. Training corpora are sourced from various types of data sets, and the text is then normalized using a sequence of hand-written grammars and learned models. These systems need to scale to hundreds or thousands of language varieties in order to meet product needs. 
Frequent data refreshes, privacy considerations and simultaneous updates across such a high number of languages make manual inspection of the normalized training data infeasible, while there is ample opportunity for data normalization issues. By tracking metrics about the data and how it was processed, we are able to catch internal data processing issues and external data corruption issues that can be hard to notice using standard extrinsic evaluation methods. Showing the importance of paying attention to data normalization behavior in large-scale pipelines, these metrics have highlighted issues in Google’s real-world speech recognition system that have caused significant, but latent, quality degradation.",{"paper_id":4900,"title":4901,"year":81,"month":855,"day":63,"doi":4902,"resource_url":4903,"first_page":63,"last_page":63,"pdf_url":4904,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4905,"paper_type":860,"authors":4906,"abstract":4913},"lrec2018-main-217","DeModify: A Dataset for Analyzing Contextual Constraints on Modifier Deletion","10.63317\u002F3dchvp89kh4j","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-217","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F111.pdf","nastase-etal-2018-demodify",[4907,4908,4911],{"paper_id":4900,"author_seq":247,"given_name":3074,"surname":3075,"affiliation":63,"orcid":63},{"paper_id":4900,"author_seq":232,"given_name":4909,"surname":4910,"affiliation":63,"orcid":63},"Devon","Fritz",{"paper_id":4900,"author_seq":218,"given_name":4912,"surname":3326,"affiliation":63,"orcid":63},"Anette","Tasks such as knowledge extraction, text simplification and summarization have in common the fact that from a text fragment a smaller (not necessarily contiguous) portion is obtained by discarding part of the context. This may cause the text fragment to acquire a new meaning, or even to become false. 
The smallest units that can be considered disposable in a larger context are modifiers. In this paper we describe a dataset collected and annotated to facilitate the study of the influence of modifiers on the meaning of the context they are part of, and to support the development of models that can determine whether a modifier can be removed without undesirable semantic consequences.",{"paper_id":4915,"title":4916,"year":81,"month":855,"day":63,"doi":4917,"resource_url":4918,"first_page":63,"last_page":63,"pdf_url":4919,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4920,"paper_type":860,"authors":4921,"abstract":4924},"lrec2018-main-218","Open Subtitles Paraphrase Corpus for Six Languages","10.63317\u002F4kki5rwb9a2a","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-218","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F131.pdf","creutz-2018-open",[4922],{"paper_id":4915,"author_seq":247,"given_name":4323,"surname":4923,"affiliation":63,"orcid":63},"Creutz","This paper accompanies the release of Opusparcus, a new paraphrase corpus for six European languages: German, English, Finnish, French, Russian, and Swedish. The corpus consists of paraphrases, that is, pairs of sentences in the same language that mean approximately the same thing. The paraphrases are extracted from the OpenSubtitles2016 corpus, which contains subtitles from movies and TV shows. The informal and colloquial genre that occurs in subtitles makes such data a very interesting language resource, for instance, from the perspective of computer assisted language learning. For each target language, the Opusparcus data have been  partitioned into three types of data sets: training, development and test sets. The training sets are large, consisting of millions of sentence pairs, and have been compiled automatically, with the help of probabilistic ranking functions. 
The development and test sets consist of sentence pairs that have been checked manually; each set contains approximately 1000 sentence pairs that have been verified to be acceptable paraphrases by two annotators.",{"paper_id":4926,"title":4927,"year":81,"month":855,"day":63,"doi":4928,"resource_url":4929,"first_page":63,"last_page":63,"pdf_url":4930,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4931,"paper_type":860,"authors":4932,"abstract":4942},"lrec2018-main-219","Fine-grained Semantic Textual Similarity for Serbian","10.63317\u002F3ttcoccvxkzp","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-219","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F442.pdf","batanovic-etal-2018-fine",[4933,4936,4939],{"paper_id":4926,"author_seq":247,"given_name":4934,"surname":4935,"affiliation":63,"orcid":63},"Vuk","Batanović",{"paper_id":4926,"author_seq":232,"given_name":4937,"surname":4938,"affiliation":63,"orcid":63},"Miloš","Cvetanović",{"paper_id":4926,"author_seq":218,"given_name":4940,"surname":4941,"affiliation":63,"orcid":63},"Boško","Nikolić","Although the task of semantic textual similarity (STS) has gained in prominence in the last few years, annotated STS datasets for model training and evaluation, particularly those with fine-grained similarity scores, remain scarce for languages other than English, and practically non-existent for minor ones. In this paper, we present the Serbian Semantic Textual Similarity News Corpus (STS.news.sr) – an STS dataset for Serbian that contains 1192 sentence pairs annotated with fine-grained semantic similarity scores. We describe the process of its creation and annotation, and we analyze and compare our corpus with the existing news-based STS datasets in English and other major languages. 
Several existing STS models are evaluated on the Serbian STS News Corpus, and a new supervised bag-of-words model that combines part-of-speech weighting with term frequency weighting is proposed and shown to outperform similar methods. Since Serbian is a morphologically rich language, the effect of various morphological normalization tools on STS model performances is considered as well. The Serbian STS News Corpus, the annotation tool and guidelines used in its creation, and the STS model framework used in the evaluation are all made publicly available.",{"paper_id":4944,"title":4945,"year":81,"month":855,"day":63,"doi":4946,"resource_url":4947,"first_page":63,"last_page":63,"pdf_url":4948,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4949,"paper_type":860,"authors":4950,"abstract":4957},"lrec2018-main-220","SPADE: Evaluation Dataset for Monolingual Phrase Alignment","10.63317\u002F5894atgjvr8a","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-220","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F502.pdf","arase-tsujii-2018-spade",[4951,4954],{"paper_id":4944,"author_seq":247,"given_name":4952,"surname":4953,"affiliation":63,"orcid":63},"Yuki","Arase",{"paper_id":4944,"author_seq":232,"given_name":4955,"surname":4956,"affiliation":63,"orcid":63},"Junichi","Tsujii","We create the SPADE (Syntactic Phrase Alignment Dataset for Evaluation) for systematic research on syntactic phrase alignment in paraphrasal sentences.  This is the first dataset to shed lights on syntactic and phrasal paraphrases under linguistically motivated grammar.  Existing datasets available for evaluation on phrasal paraphrase detection define the unit of phrase as simply sequence of words without syntactic structures due to difficulties caused by the non-homographic nature of phrase correspondences in sentential paraphrases.  
Different from these, the SPADE provides annotations of gold parse trees by a linguistic expert and gold phrase alignments identified by three annotators.  Consequently, 20,276 phrases are extracted from 201 sentential paraphrases, on which 15,721 alignments are obtained that at least one annotator regarded as paraphrases.  The SPADE is available at Linguistic Data Consortium for future research on paraphrases.  In addition, two metrics are proposed to evaluate to what extent the automatic phrase alignment results agree with the ones identified by humans.  These metrics allow objective comparison of performances of different methods evaluated on the SPADE.  Benchmarks to show performances of humans and the state-of-the-art method are presented as a reference for future SPADE users.",{"paper_id":4959,"title":4960,"year":81,"month":855,"day":63,"doi":4961,"resource_url":4962,"first_page":63,"last_page":63,"pdf_url":4963,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4964,"paper_type":860,"authors":4965,"abstract":4974},"lrec2018-main-221","ETPC - A Paraphrase Identification Corpus Annotated with Extended Paraphrase Typology and Negation","10.63317\u002F4qs8cbd9zj4g","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-221","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F661.pdf","kovatchev-etal-2018-etpc",[4966,4969,4972],{"paper_id":4959,"author_seq":247,"given_name":4967,"surname":4968,"affiliation":63,"orcid":63},"Venelin","Kovatchev",{"paper_id":4959,"author_seq":232,"given_name":4970,"surname":4971,"affiliation":63,"orcid":63},"M. Antònia","Martí",{"paper_id":4959,"author_seq":218,"given_name":2301,"surname":4973,"affiliation":63,"orcid":63},"Salamó","We present the Extended Paraphrase typology (EPT) and the Extended Typology Paraphrase Corpus (ETPC). 
The EPT typology addresses several practical limitations of existing paraphrase typologies: it is the first typology that copes with the non-paraphrase pairs in the paraphrase identification corpora and distinguishes between contextual and habitual paraphrase types. ETPC is the largest corpus to date annotated with atomic paraphrase types. It is the first corpus with detailed annotation of both the paraphrase and the non-paraphrase pairs and the first corpus annotated with paraphrase and negation. Both new resources contribute to better understanding the paraphrase phenomenon, and allow for studying the relationship between paraphrasing and negation. To the developers of Paraphrase Identification systems ETPC corpus offers better means for evaluation and error analysis. Furthermore, the EPT typology and ETPC corpus emphasize the relationship with other areas of NLP such as Semantic Similarity, Textual Entailment, Summarization and Simplification.",{"paper_id":4976,"title":4977,"year":81,"month":855,"day":63,"doi":4978,"resource_url":4979,"first_page":63,"last_page":63,"pdf_url":4980,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4981,"paper_type":860,"authors":4982,"abstract":4989},"lrec2018-main-222","Introducing a Lexicon of Verbal Polarity Shifters for English","10.63317\u002F3d9x4memsk7e","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-222","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F110.pdf","schulder-etal-2018-introducing",[4983,4985,4986,4987],{"paper_id":4976,"author_seq":247,"given_name":4664,"surname":4984,"affiliation":63,"orcid":63},"Schulder",{"paper_id":4976,"author_seq":232,"given_name":1780,"surname":2799,"affiliation":63,"orcid":63},{"paper_id":4976,"author_seq":218,"given_name":2804,"surname":2805,"affiliation":63,"orcid":63},{"paper_id":4976,"author_seq":203,"given_name":1205,"surname":4988,"affiliation":63,"orcid":63},"Köser","The sentiment polarity of a phrase does not 
only depend on the polarities of its words, but also on how these are affected by their context. Negation words (e.g. \"not\", \"no\", \"never\") can change the polarity of a phrase. Similarly, verbs and other content words can also act as polarity shifters (e.g. \"fail\", \"deny\", \"alleviate\"). While individually more sparse, they are far more numerous. Among verbs alone, there are more than 1200 shifters. However, sentiment analysis systems barely consider polarity shifters other than negation words. A major reason for this is the scarcity of lexicons and corpora that provide information on them. We introduce a lexicon of verbal polarity shifters that covers the entirety of verbs found in WordNet. We provide a fine-grained annotation of individual word senses, as well as information for each verbal shifter on the syntactic scopes that it can affect.",{"paper_id":4991,"title":4992,"year":81,"month":855,"day":63,"doi":4993,"resource_url":4994,"first_page":63,"last_page":63,"pdf_url":4995,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":4996,"paper_type":860,"authors":4997,"abstract":5001},"lrec2018-main-223","JFCKB: Japanese Feature Change Knowledge Base","10.63317\u002F34nronidk9r6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-223","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F244.pdf","nakamura-kawahara-2018-jfckb",[4998,5000],{"paper_id":4991,"author_seq":247,"given_name":4999,"surname":4455,"affiliation":63,"orcid":63},"Tetsuaki",{"paper_id":4991,"author_seq":232,"given_name":1879,"surname":1880,"affiliation":63,"orcid":63},"Commonsense knowledge plays an essential role in our language activities. Although many projects have aimed to develop language resources for commonsense knowledge, there is little work focusing on connotational meanings. This is because constructing commonsense knowledge including connotational meanings is challenging. 
In this paper, we present a Japanese knowledge base where arguments in event sentences are associated with various feature changes caused by the events. For example, ``my child'' in ``my wife hits my child'' is associated with some feature changes, such as increase in pain, increase in anger, increase in disgust, and decrease in joy. We constructed this knowledge base through crowdsourcing tasks by gathering feature changes of arguments in event sentences. After the construction of the knowledge base, we conducted an experiment in anaphora resolution using the knowledge base. We regarded anaphora resolution as an antecedent candidate ranking task and used Ranking SVM as the solver. Experimental results demonstrated the usefulness of our feature change knowledge base.",{"paper_id":5003,"title":5004,"year":81,"month":855,"day":63,"doi":5005,"resource_url":5006,"first_page":63,"last_page":63,"pdf_url":5007,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5008,"paper_type":860,"authors":5009,"abstract":5017},"lrec2018-main-224","Quantifying Qualitative Data for Understanding Controversial Issues","10.63317\u002F5kuqdkbfxhb3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-224","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F321.pdf","wojatzki-etal-2018-quantifying",[5010,5012,5013,5016],{"paper_id":5003,"author_seq":247,"given_name":1780,"surname":5011,"affiliation":63,"orcid":63},"Wojatzki",{"paper_id":5003,"author_seq":232,"given_name":1436,"surname":1437,"affiliation":63,"orcid":63},{"paper_id":5003,"author_seq":218,"given_name":5014,"surname":5015,"affiliation":63,"orcid":63},"Torsten","Zesch",{"paper_id":5003,"author_seq":203,"given_name":1490,"surname":1491,"affiliation":63,"orcid":63},"Understanding public opinion on complex controversial issues such as ‘Legalization of Marijuana’ is of considerable importance for a number of objectives. 
However, an individual’s position on a controversial issue is often not a binary support-or-oppose stance on the issue, but rather a conglomerate of nuanced opinions on various aspects of the issue.  These opinions are often expressed qualitatively in free text in surveys or on social media.  However, quantifying vast amounts of qualitative information remains a significant challenge.  The goal of this work is to provide a new approach for quantifying qualitative data for the understanding of controversial issues.  First, we show how we can engage people directly through crowdsourcing to create a comprehensive dataset of assertions (claims, opinions, etc.) relevant to an issue.  Next, the assertions are judged for agreement and strength of support or opposition.  The collected Dataset of Nuanced Assertions on Controversial Issues (NAoCI dataset) consists of over 2,000 assertions on sixteen different controversial issues.  It has over 100,000 judgments of whether people agree or disagree with the assertions, and of about 70,000 judgments indicating how strongly people support or oppose the assertions.  
This dataset allows for several useful analyses that help summarize public opinion.",{"paper_id":5019,"title":5020,"year":81,"month":855,"day":63,"doi":5021,"resource_url":5022,"first_page":63,"last_page":63,"pdf_url":5023,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5024,"paper_type":860,"authors":5025,"abstract":5035},"lrec2018-main-225","Distribution of Emotional Reactions to News Articles in Twitter","10.63317\u002F3ic2p773dmwt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-225","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F416.pdf","gambino-etal-2018-distribution",[5026,5029,5032],{"paper_id":5019,"author_seq":247,"given_name":5027,"surname":5028,"affiliation":63,"orcid":63},"Omar Juárez","Gambino",{"paper_id":5019,"author_seq":232,"given_name":5030,"surname":5031,"affiliation":63,"orcid":63},"Hiram","Calvo",{"paper_id":5019,"author_seq":218,"given_name":5033,"surname":5034,"affiliation":63,"orcid":63},"Consuelo-Varinia","García-Mendoza","Several datasets of opinions expressed by Social networks' users have been created to explore Sentiment Analysis tasks like Sentiment Polarity and Emotion Mining. Most of these datasets are focused on the writers' perspective, that is, the post written by a user is analyzed to determine the expressed sentiment on it. This kind of datasets do not consider the source that provokes those opinions (e.g. a previous post). In this work, we propose a dataset focused on the readers' perspective. The developed dataset contains news articles published by three newspapers and the distribution of six predefined emotions expressed by readers of the articles in Twitter. This dataset was built aiming to explore how the six emotions are expressed by Twitter users' after reading a news article. 
We show some results of a machine learning method used to predict the distribution of emotions in unseen news articles.",{"paper_id":5037,"title":5038,"year":81,"month":855,"day":63,"doi":5039,"resource_url":5040,"first_page":63,"last_page":63,"pdf_url":5041,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5042,"paper_type":860,"authors":5043,"abstract":5055},"lrec2018-main-226","Aggression-annotated Corpus of Hindi-English Code-mixed Data","10.63317\u002F2otsapvt2qu9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-226","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F861.pdf","kumar-etal-2018-aggression",[5044,5046,5049,5052],{"paper_id":5037,"author_seq":247,"given_name":5045,"surname":3717,"affiliation":63,"orcid":63},"Ritesh",{"paper_id":5037,"author_seq":232,"given_name":5047,"surname":5048,"affiliation":63,"orcid":63},"Aishwarya N.","Reganti",{"paper_id":5037,"author_seq":218,"given_name":5050,"surname":5051,"affiliation":63,"orcid":63},"Akshit","Bhatia",{"paper_id":5037,"author_seq":203,"given_name":5053,"surname":5054,"affiliation":63,"orcid":63},"Tushar","Maheshwari","As the interaction over the web has increased, incidents of aggression and related events like trolling, cyberbullying, flaming, hate speech, etc. too have increased manifold across the globe. While most of these behaviour like bullying or hate speech have predated the Internet, the reach and extent of the Internet has given these an unprecedented power and influence to affect the lives of billions of people. So it is of utmost significance and importance that some preventive measures be taken to provide safeguard to the people using the web such that the web remains a viable medium of communication and connection, in general. 
In this paper, we discuss the development of an aggression tagset and an annotated corpus of Hindi-English code-mixed data from two of the most popular social networking \u002F social media platforms in India – Twitter and Facebook. The corpus is annotated using a hierarchical tagset of 3 top-level tags and 10 level 2 tags. The final dataset contains approximately 18k tweets and 21k facebook comments and is being released for further research in the field.",{"paper_id":5057,"title":5058,"year":81,"month":855,"day":63,"doi":5059,"resource_url":5060,"first_page":63,"last_page":63,"pdf_url":5061,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5062,"paper_type":860,"authors":5063,"abstract":5068},"lrec2018-main-227","Creating a Verb Synonym Lexicon Based on a Parallel Corpus","10.63317\u002F22gy63y5empd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-227","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F33.pdf","uresova-etal-2018-creating",[5064,5065,5066,5067],{"paper_id":5057,"author_seq":247,"given_name":3455,"surname":3456,"affiliation":63,"orcid":63},{"paper_id":5057,"author_seq":232,"given_name":890,"surname":3458,"affiliation":63,"orcid":63},{"paper_id":5057,"author_seq":218,"given_name":890,"surname":3460,"affiliation":63,"orcid":63},{"paper_id":5057,"author_seq":203,"given_name":2633,"surname":3462,"affiliation":63,"orcid":63},"This paper presents the first findings of our recently started project of building a new lexical resource called CzEngClass, which consists of bilingual verbal synonym groups. In order to create such a resource, we explore semantic ‘equivalence’ of verb senses (across different verb lexemes) in a bilingual (Czech-English) setting by using translational context of real-world texts in a parallel, richly annotated dependency corpus. 
When grouping semantically equivalent verb senses into classes of synonyms, we focus on valency (arguments as deep dependents with morphosyntactic features relevant for surface dependencies) and its mapping to a set of semantic ``roles'' for verb arguments, common within one class. We argue that the existence of core argument mappings and certain adjunct mappings to a common set of semantic roles is a suitable criterion for a reasonable verb synonymy definition, possibly accompanied with additional contextual restrictions. By mid-2018, the first version of the lexicon called CzEngClass will be publicly available.",{"paper_id":5070,"title":5071,"year":81,"month":855,"day":63,"doi":5072,"resource_url":5073,"first_page":63,"last_page":63,"pdf_url":5074,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5075,"paper_type":860,"authors":5076,"abstract":5086},"lrec2018-main-228","Evaluation of Domain-specific Word Embeddings using Knowledge Resources","10.63317\u002F4jiub6k5vwft","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-228","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F268.pdf","nooralahzadeh-etal-2018-evaluation",[5077,5080,5083],{"paper_id":5070,"author_seq":247,"given_name":5078,"surname":5079,"affiliation":63,"orcid":63},"Farhad","Nooralahzadeh",{"paper_id":5070,"author_seq":232,"given_name":5081,"surname":5082,"affiliation":63,"orcid":63},"Lilja","Øvrelid",{"paper_id":5070,"author_seq":218,"given_name":5084,"surname":5085,"affiliation":63,"orcid":63},"Jan Tore","Lønning","In this work we evaluate domain-specific embedding models induced from textual resources in the Oil and Gas domain. We conduct intrinsic and extrinsic evaluations of both general and domain-specific embeddings and we observe that constructing  domain-specific word embeddings is worthwhile even with a considerably smaller corpus size. 
Although the intrinsic evaluation shows low performance in synonymy detection, an in-depth error analysis reveals the ability of these models to discover additional semantic relations such as hyponymy, co-hyponymy and relatedness in the target domain. Extrinsic evaluation of the embedding models is provided by a domain-specific sentence classification task, which we solve using a convolutional neural network. We further adapt embedding enhancement methods to provide vector representations for infrequent and unseen terms. Experiments show that the adapted technique can provide improvements both in intrinsic and extrinsic evaluation.",{"paper_id":5088,"title":5089,"year":81,"month":855,"day":63,"doi":5090,"resource_url":5091,"first_page":63,"last_page":63,"pdf_url":5092,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5093,"paper_type":860,"authors":5094,"abstract":5103},"lrec2018-main-229","Automatic Thesaurus Construction for Modern Hebrew","10.63317\u002F23nemfp4exmw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-229","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F460.pdf","liebeskind-etal-2018-automatic",[5095,5098,5101],{"paper_id":5088,"author_seq":247,"given_name":5096,"surname":5097,"affiliation":63,"orcid":63},"Chaya","Liebeskind",{"paper_id":5088,"author_seq":232,"given_name":5099,"surname":5100,"affiliation":63,"orcid":63},"Ido","Dagan",{"paper_id":5088,"author_seq":218,"given_name":1370,"surname":5102,"affiliation":63,"orcid":63},"Schler","Automatic thesaurus construction for Modern Hebrew is a complicated task, due to its high degree of inflectional ambiguity. Linguistics tools, including morphological analyzers, part-of-speech taggers and parsers often have limited in performance on Morphologically Rich Languages (MRLs) such as Hebrew. 
In this paper, we adopted a schematic methodology for generating a co-occurrence based thesaurus in a MRL and extended the methodology to create distributional similarity thesaurus. We explored three alternative levels of morphological term representations, surface form, lemma, and multiple lemmas, all complemented by the clustering of morphological variants. First, we evaluated both the co-occurrence based method and the distributional similarity method using Hebrew WordNet as our gold standard. However, due to Hebrew WordNet's low coverage, we completed our analysis with a manual evaluation. The results showed that for Modern Hebrew corpus-based thesaurus construction, the most directly applied statistical collection, using linguistics tools at the lemma level, is not optimal.",{"paper_id":5105,"title":5106,"year":81,"month":855,"day":63,"doi":5107,"resource_url":5108,"first_page":63,"last_page":63,"pdf_url":5109,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5110,"paper_type":860,"authors":5111,"abstract":5116},"lrec2018-main-230","Automatic Wordnet Mapping: from CoreNet to Princeton WordNet","10.63317\u002F3qu5ixou95w3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-230","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F892.pdf","kim-etal-2018-automatic",[5112,5113,5114,5115],{"paper_id":5105,"author_seq":247,"given_name":1103,"surname":1104,"affiliation":63,"orcid":63},{"paper_id":5105,"author_seq":232,"given_name":1100,"surname":1101,"affiliation":63,"orcid":63},{"paper_id":5105,"author_seq":218,"given_name":1106,"surname":1107,"affiliation":63,"orcid":63},{"paper_id":5105,"author_seq":203,"given_name":1109,"surname":1110,"affiliation":63,"orcid":63},"CoreNet is a lexico-semantic network of 73,100 Korean word senses, which are categorized under 2,937 semantic categories organized in a taxonomy. 
Recently, to foster the more widespread use of CoreNet, there was an attempt to map the semantic categories of CoreNet into synsets of Princeton WordNet by lexical relations such as synonymy, hyponymy, and hypernymy relations. One of the limitations of the existing mapping is that it is only focused on mapping the semantic categories, but not on mapping the word senses, which are the majority part (96%) of CoreNet. To boost bridging the gap between CoreNet and WordNet, we introduce the automatic mapping approach to link the word senses of CoreNet into WordNet synsets. The evaluation shows that our approach successfully maps previously unmapped 38,028 word senses into WordNet synsets with the precision of 91.2% (±1.14 with 99% confidence).",{"paper_id":5118,"title":5119,"year":81,"month":855,"day":63,"doi":5120,"resource_url":5121,"first_page":63,"last_page":63,"pdf_url":5122,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5123,"paper_type":860,"authors":5124,"abstract":5139},"lrec2018-main-231","The New Propbank: Aligning Propbank with AMR through POS Unification","10.63317\u002F42ex46dma8qc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-231","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1074.pdf","ogorman-etal-2018-new",[5125,5128,5131,5132,5134,5137],{"paper_id":5118,"author_seq":247,"given_name":5126,"surname":5127,"affiliation":63,"orcid":63},"Tim","O’Gorman",{"paper_id":5118,"author_seq":232,"given_name":5129,"surname":5130,"affiliation":63,"orcid":63},"Sameer","Pradhan",{"paper_id":5118,"author_seq":218,"given_name":1022,"surname":1023,"affiliation":63,"orcid":63},{"paper_id":5118,"author_seq":203,"given_name":2984,"surname":5133,"affiliation":63,"orcid":63},"Bonn",{"paper_id":5118,"author_seq":188,"given_name":5135,"surname":5136,"affiliation":63,"orcid":63},"Katie","Conger",{"paper_id":5118,"author_seq":172,"given_name":1016,"surname":5138,"affiliation":63,"orcid":63},"Gung","We present 
a corpus which converts the sense labels of existing Propbank resources to a new unified format which is more compatible with AMR and more robust to sparsity. This adopts an innovation of the Abstract Meaning Representation project(Banarescu et al. 2013) in which one abstracts away from different, related parts of speech, so that related forms such as \"insert\" and \"insertion\" could be represented by the same roleset and use the same semantic roles. We note that this conversion also serves to make the different English Propbank corpora released over the years consistent with each other, so that one might train and evaluate systems upon that larger combined data.   We present analysis of some appealing characteristics of this final dataset, and present preliminary results of training and evaluating SRL systems on this combined set, to spur usage of this challenging new dataset.",{"paper_id":5141,"title":5142,"year":81,"month":855,"day":63,"doi":5143,"resource_url":5144,"first_page":63,"last_page":63,"pdf_url":5145,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5146,"paper_type":860,"authors":5147,"abstract":5157},"lrec2018-main-232","The Boarnsterhim Corpus: A Bilingual Frisian-Dutch Panel and Trend Study","10.63317\u002F26u2zhaxjc69","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-232","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F29.pdf","sloos-etal-2018-boarnsterhim",[5148,5151,5154],{"paper_id":5141,"author_seq":247,"given_name":5149,"surname":5150,"affiliation":63,"orcid":63},"Marjoleine","Sloos",{"paper_id":5141,"author_seq":232,"given_name":5152,"surname":5153,"affiliation":63,"orcid":63},"Eduard","Drenth",{"paper_id":5141,"author_seq":218,"given_name":5155,"surname":5156,"affiliation":63,"orcid":63},"Wilbert","Heeringa","The Boarnsterhim Corpus consists of 250 hours of speech in both West Frisian and Dutch by the same sample of bilingual speakers. 
The corpus contains original recordings from 1982-1984 and a replication study recorded 35 years later. The data collection spans speech of four generations, and combines panel and trend data. This paper describes the Boarnsterhim Corpus halfway the project which started in 2016 and describes the way it was collected, the annotations, potential use, and the envisaged tools and end-user web application.",{"paper_id":5159,"title":5160,"year":81,"month":855,"day":63,"doi":5161,"resource_url":5162,"first_page":63,"last_page":63,"pdf_url":5163,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5164,"paper_type":860,"authors":5165,"abstract":5175},"lrec2018-main-233","The French-Algerian Code-Switching Triggered audio corpus (FACST)","10.63317\u002F3zcobqdy98az","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-233","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F801.pdf","djegdjiga-etal-2018-french",[5166,5169,5172],{"paper_id":5159,"author_seq":247,"given_name":5167,"surname":5168,"affiliation":63,"orcid":63},"Amazouz","Djegdjiga",{"paper_id":5159,"author_seq":232,"given_name":5170,"surname":5171,"affiliation":63,"orcid":63},"Martine","Adda-Decker",{"paper_id":5159,"author_seq":218,"given_name":5173,"surname":5174,"affiliation":63,"orcid":63},"Lori","Lamel","The French Algerian Code-Switching Triggered corpus (FACST) was created in order to support a variety of studies in phonetics, prosody and natural language processing. The first aim of the FACST corpus is to collect a spontaneous Code-switching speech (CS) corpus. In order to obtain a large quantity of spontaneous CS utterances in natural conversations experiments were carried out on how to elicit CS. Applying a triggering protocol by means of code-switched questions was found to be effective in eliciting CS in the responses. To ensure good audio quality, all recordings were made in a soundproof room or in a very calm room. 
This paper describes FACST corpus, along with the principal steps to build a CS speech corpus in French-Algerian languages and data collection steps. We also explain the selection criteria for the CS speakers and the recording protocols used. We present the methods used for data segmentation and annotation, and propose a conventional transcription of this type of speech in each language with the aim of being well-suited for both computational linguistic and acoustic-phonetic studies. We provide an a quantitative description of the FACST corpus along with results of linguistic studies, and discuss some of the challenges we faced in collecting CS data.",{"paper_id":5177,"title":5178,"year":81,"month":855,"day":63,"doi":5179,"resource_url":5180,"first_page":63,"last_page":63,"pdf_url":5181,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5182,"paper_type":860,"authors":5183,"abstract":5195},"lrec2018-main-234","Strategies and Challenges for Crowdsourcing Regional Dialect Perception Data for Swiss German and Swiss French","10.63317\u002F42yv7aekrt37","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-234","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F920.pdf","goldman-etal-2018-strategies",[5184,5187,5189,5192],{"paper_id":5177,"author_seq":247,"given_name":5185,"surname":5186,"affiliation":63,"orcid":63},"Jean-Philippe","Goldman",{"paper_id":5177,"author_seq":232,"given_name":1269,"surname":5188,"affiliation":63,"orcid":63},"Clematide",{"paper_id":5177,"author_seq":218,"given_name":5190,"surname":5191,"affiliation":63,"orcid":63},"Mathieu","Avanzi",{"paper_id":5177,"author_seq":203,"given_name":5193,"surname":5194,"affiliation":63,"orcid":63},"Raphael","Tandler","Following the dynamics of several recent crowdsourcing projects with the aim of collecting linguistic data, this paper focuses on such a project in the field of Swiss German dialects and Swiss French accents. 
The main scientific goal of the data collected is to understand people’s perception of dialects and accents, and provide a resource for future computational systems such as automatic dialect recognition. A gamified crowdsourcing platform was set up and launched for both main locales of Switzerland: “din dialäkt” (‘your dialect’) for Swiss German dialects and “ton accent” (‘your accent’) for Swiss French. The main activity for the participant is to localize preselected audio samples by clicking on a map of Switzerland.  The media was highly interested in the two platforms and many reports appeared in newspapers, television and radio, which increased the public’s awareness of the project and thus also the traffic on the page. At this point of the project, 7,500 registered users (beside 30,000 anonymous visitors), have provided 470,000 localizations. By connecting user’s results of this localization task to their socio-demographic information, a quantitative analysis of the localization data can reveal which factors play a role in their performance. Preliminary results showed that age and childhood residence influence the how well dialects\u002Faccents are recognized. Nevertheless, quantity does not ensure quality when it comes to data. Crowdsourcing such linguistic data revealed traps to avoid such as scammers, or the participants’ quick loss of motivation causing them to click randomly. 
Such obstacles need to be taken into account when assessing the reliability of data and require a number of preliminary steps before an analysis of the data.",{"paper_id":5197,"title":5198,"year":81,"month":855,"day":63,"doi":5199,"resource_url":5200,"first_page":63,"last_page":63,"pdf_url":5201,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5202,"paper_type":860,"authors":5203,"abstract":5221},"lrec2018-main-235","Phonetically Balanced Code-Mixed Speech Corpus for Hindi-English Automatic Speech Recognition","10.63317\u002F3xo52xmbi9zq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-235","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F940.pdf","pandey-etal-2018-phonetically",[5204,5207,5210,5212,5215,5218],{"paper_id":5197,"author_seq":247,"given_name":5205,"surname":5206,"affiliation":63,"orcid":63},"Ayushi","Pandey",{"paper_id":5197,"author_seq":232,"given_name":5208,"surname":5209,"affiliation":63,"orcid":63},"Brij Mohan Lal","Srivastava",{"paper_id":5197,"author_seq":218,"given_name":5211,"surname":3717,"affiliation":63,"orcid":63},"Rohit",{"paper_id":5197,"author_seq":203,"given_name":5213,"surname":5214,"affiliation":63,"orcid":63},"Bhanu Teja","Nellore",{"paper_id":5197,"author_seq":188,"given_name":5216,"surname":5217,"affiliation":63,"orcid":63},"Kasi Sai","Teja",{"paper_id":5197,"author_seq":172,"given_name":5219,"surname":5220,"affiliation":63,"orcid":63},"Suryakanth V.","Gangashetty","The paper presents the development of a phonetically balanced read speech corpus of code-mixed Hindi-English. Phonetic balance in the corpus has been created by selecting sentences that contained triphones lower in frequency than a predefined threshold. The assumption with a compulsory inclusion of such rare units was that the high frequency triphones will inevitably be included. 
Using this metric, the Pearson's correlation coefficient of the phonetically balanced corpus with a large code-mixed reference corpus was recorded to be 0.996. The data for corpus creation has been extracted from selected sections of Hindi newspapers.These sections contain frequent English insertions in a matrix of Hindi sentence. Statistics on the phone and triphone distribution have been presented, to graphically display the phonetic likeness between the reference corpus and the corpus sampled through our method.",{"paper_id":5223,"title":5224,"year":81,"month":855,"day":63,"doi":5225,"resource_url":5226,"first_page":63,"last_page":63,"pdf_url":5227,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5228,"paper_type":860,"authors":5229,"abstract":5237},"lrec2018-main-236","Chinese-Portuguese Machine Translation: A Study on Building Parallel Corpora from Comparable Texts","10.63317\u002F3eijvi7dhib3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-236","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F618.pdf","liu-etal-2018-chinese",[5230,5233,5235],{"paper_id":5223,"author_seq":247,"given_name":5231,"surname":5232,"affiliation":63,"orcid":63},"Siyou","Liu",{"paper_id":5223,"author_seq":232,"given_name":5234,"surname":1588,"affiliation":63,"orcid":63},"Longyue",{"paper_id":5223,"author_seq":218,"given_name":5236,"surname":5232,"affiliation":63,"orcid":63},"Chao-Hong","Although there are increasing and significant ties between China and Portuguese-speaking countries, there is not much parallel corpora in the Chinese-Portuguese language pair. Both languages are very populous, with 1.2 billion native Chinese speakers and 279 million native Portuguese speakers, the language pair, however, could be considered as low-resource in terms of available parallel corpora. In this paper, we describe our methods to curate Chinese-Portuguese parallel corpora and evaluate their quality. 
We extracted bilingual data from Macao government websites and proposed a hierarchical strategy to build a large parallel corpus. Experiments are conducted on existing and our corpora using both Phrased-Based Machine Translation (PBMT) and the state-of-the-art Neural Machine Translation (NMT) models. The results of this work can be used as a benchmark for future Chinese-Portuguese MT systems. The approach we used in this paper also show a good example on how to boost performance of MT systems for low-resource language pairs.",{"paper_id":5239,"title":5240,"year":81,"month":855,"day":63,"doi":5241,"resource_url":5242,"first_page":63,"last_page":63,"pdf_url":5243,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5244,"paper_type":860,"authors":5245,"abstract":5253},"lrec2018-main-237","Evaluating the WordsEye Text-to-Scene System: Imaginative and Realistic Sentences","10.63317\u002F47wuf8ffetux","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-237","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F115.pdf","ulinski-etal-2018-evaluating",[5246,5249,5252],{"paper_id":5239,"author_seq":247,"given_name":5247,"surname":5248,"affiliation":63,"orcid":63},"Morgan","Ulinski",{"paper_id":5239,"author_seq":232,"given_name":5250,"surname":5251,"affiliation":63,"orcid":63},"Bob","Coyne",{"paper_id":5239,"author_seq":218,"given_name":2984,"surname":2985,"affiliation":63,"orcid":63},"We describe our evaluation of the WordsEye text-to-scene generation system. We address the problem of evaluating the output of such a system vs. simple search methods to find a picture to illustrate a sentence. To do this, we constructed two sets of test sentences: a set of crowdsourced imaginative sentences and a set of realistic sentences extracted from the PASCAL image caption corpus (Rashtchian et al., 2010). For each sentence, we compared sample pictures found using Google Image Search to those produced by WordsEye. 
We then crowdsourced judgments as to which picture best illustrated each sentence. For imaginative sentences, pictures produced by WordsEye were preferred, but for realistic sentences, Google Image Search results were preferred. We also used crowdsourcing to obtain a rating for how well each picture illustrated the sentence, from 1 (completely correct) to 5 (completely incorrect). WordsEye pictures had an average rating of 2.58 on imaginative sentences and 2.54 on realistic sentences; Google images had an average rating of 3.82 on imaginative sentences and 1.87 on realistic sentences. We also discuss the sources of errors in theWordsEye system.",{"paper_id":5255,"title":5256,"year":81,"month":855,"day":63,"doi":5257,"resource_url":5258,"first_page":63,"last_page":63,"pdf_url":5259,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5260,"paper_type":860,"authors":5261,"abstract":5275},"lrec2018-main-238","Computer-assisted Speaker Diarization: How to Evaluate Human Corrections","10.63317\u002F49fxp3d7z28y","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-238","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F390.pdf","broux-etal-2018-computer",[5262,5265,5267,5269,5272],{"paper_id":5255,"author_seq":247,"given_name":5263,"surname":5264,"affiliation":63,"orcid":63},"Pierre-Alexandre","Broux",{"paper_id":5255,"author_seq":232,"given_name":1199,"surname":5266,"affiliation":63,"orcid":63},"Doukhan",{"paper_id":5255,"author_seq":218,"given_name":1269,"surname":5268,"affiliation":63,"orcid":63},"Petitrenaud",{"paper_id":5255,"author_seq":203,"given_name":5270,"surname":5271,"affiliation":63,"orcid":63},"Sylvain","Meignier",{"paper_id":5255,"author_seq":188,"given_name":5273,"surname":5274,"affiliation":63,"orcid":63},"Jean","Carrive","In this paper, we present a framework to evaluate the human correction of a speaker diarization. 
We propose four elementary actions to correct the diarization and an automaton to simulate the correction sequence. A metric is described to evaluate the correction cost. The framework is evaluated using French broadcast news drawn from the REPERE corpus.",{"paper_id":5277,"title":5278,"year":81,"month":855,"day":63,"doi":5279,"resource_url":5280,"first_page":63,"last_page":63,"pdf_url":5281,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5282,"paper_type":860,"authors":5283,"abstract":5287},"lrec2018-main-239","Performance Impact Caused by Hidden Bias of Training Data for Recognizing Textual Entailment","10.63317\u002F26mjmiipm2qf","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-239","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F786.pdf","tsuchiya-2018-performance",[5284],{"paper_id":5277,"author_seq":247,"given_name":5285,"surname":5286,"affiliation":63,"orcid":63},"Masatoshi","Tsuchiya","The quality of training data is one of the crucial problems when a learning-centered approach is employed. This paper proposes a new method to investigate the quality of a large corpus designed for the recognizing textual entailment (RTE) task.              The proposed method, which is inspired by a statistical hypothesis test, consists of two phases: the first phase is to introduce the predictability of textual entailment labels as a null hypothesis which is extremely unacceptable if a target corpus has no hidden bias, and the second phase is to test the null hypothesis using a Naive Bayes model.  The experimental result of the Stanford Natural Language Inference (SNLI) corpus does not reject the null hypothesis.  Therefore, it indicates that the SNLI corpus has a hidden bias which allows prediction of textual entailment labels from hypothesis sentences even if no context information is given by a premise sentence.  
This paper also presents the performance impact of NN models for RTE caused by this hidden bias.",{"paper_id":5289,"title":5290,"year":81,"month":855,"day":63,"doi":5291,"resource_url":5292,"first_page":63,"last_page":63,"pdf_url":5293,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5294,"paper_type":860,"authors":5295,"abstract":5302},"lrec2018-main-240","Evaluation of Croatian Word Embeddings","10.63317\u002F5bo2xdmzikfo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-240","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1111.pdf","svoboda-beliga-2018-evaluation",[5296,5299],{"paper_id":5289,"author_seq":247,"given_name":5297,"surname":5298,"affiliation":63,"orcid":63},"Lukáš","Svoboda",{"paper_id":5289,"author_seq":232,"given_name":5300,"surname":5301,"affiliation":63,"orcid":63},"Slobodan","Beliga","Croatian is poorly resourced and highly inflected language from Slavic language family. Nowadays, research is focusing mostly on English. We created a new word analogy dataset based on the original English Word2vec word analogy dataset and added some of the specific linguistic aspects from Croatian language.  Next, we created Croatian WordSim353 and RG65 datasets for a basic evaluation of word similarities. We compared created datasets on two popular word representation models, based on Word2Vec tool and fastText tool. Models has been trained on 1.37B tokens training data corpus and tested on a new robust Croatian word analogy dataset. Results show that models are able to create meaningful word representation. 
This research has shown that free word order and the higher morphological complexity of Croatian language influences the quality of resulting word embeddings.",{"paper_id":5304,"title":5305,"year":81,"month":855,"day":63,"doi":5306,"resource_url":5307,"first_page":63,"last_page":63,"pdf_url":5308,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5309,"paper_type":860,"authors":5310,"abstract":5317},"lrec2018-main-241","C-HTS: A Concept-based Hierarchical Text Segmentation approach","10.63317\u002F3sh597s47u5k","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-241","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F806.pdf","bayomi-lawless-2018-c",[5311,5314],{"paper_id":5304,"author_seq":247,"given_name":5312,"surname":5313,"affiliation":63,"orcid":63},"Mostafa","Bayomi",{"paper_id":5304,"author_seq":232,"given_name":5315,"surname":5316,"affiliation":63,"orcid":63},"Séamus","Lawless","Hierarchical Text Segmentation is the task of building a hierarchical structure out of text to reflect its sub-topic hierarchy. Current text segmentation approaches are based upon using lexical and\u002For syntactic similarity to identify the coherent segments of text. However, the relationship between segments may be semantic, rather than lexical or syntactic. In this paper we propose C-HTS, a Concept-based Hierarchical Text Segmentation approach that uses the semantic relatedness between text constituents. In this approach, we use the explicit semantic representation of text, a method that replaces keyword-based text representation with concept-based features, automatically extracted from massive human knowledge repositories such as Wikipedia. C-HTS represents the meaning of a piece of text as a weighted vector of knowledge concepts, in order to reason about text. We evaluate the performance of C-HTS on two publicly available datasets. 
The results show that C-HTS compares favourably with previous state-of-the-art approaches. As Wikipedia is continuously growing, we measured the impact of its growth on segmentation performance. We used three different snapshots of Wikipedia from different years in order to achieve this. The experimental results show that an increase in the size of the knowledge base leads, on average, to greater improvements in hierarchical text segmentation.",{"paper_id":5319,"title":5320,"year":81,"month":855,"day":63,"doi":5321,"resource_url":5322,"first_page":63,"last_page":63,"pdf_url":5323,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5324,"paper_type":860,"authors":5325,"abstract":5332},"lrec2018-main-242","Semantic Supersenses for English Possessives","10.63317\u002F2drsyqxd93ry","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-242","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F963.pdf","blodgett-schneider-2018-semantic",[5326,5329],{"paper_id":5319,"author_seq":247,"given_name":5327,"surname":5328,"affiliation":63,"orcid":63},"Austin","Blodgett",{"paper_id":5319,"author_seq":232,"given_name":5330,"surname":5331,"affiliation":63,"orcid":63},"Nathan","Schneider","We adapt an approach to annotating the semantics of adpositions to also include English possessives, showing that the supersense inventory of Schneider et al. (2017) works for the genitive ’s clitic and possessive pronouns as well as prepositional of. By comprehensively annotating such possessives in an English corpus of web reviews, we demonstrate that the existing supersense categories are readily applicable to possessives. 
Our corpus will facilitate empirical study of the semantics of the genitive alternation and the development of semantic disambiguation systems.",{"paper_id":5334,"title":5335,"year":81,"month":855,"day":63,"doi":5336,"resource_url":5337,"first_page":63,"last_page":63,"pdf_url":5338,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5339,"paper_type":860,"authors":5340,"abstract":5347},"lrec2018-main-243","A Corpus of Metaphor Novelty Scores for Syntactically-Related Word Pairs","10.63317\u002F3uiggry294vd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-243","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F242.pdf","parde-nielsen-2018-corpus",[5341,5344],{"paper_id":5334,"author_seq":247,"given_name":5342,"surname":5343,"affiliation":63,"orcid":63},"Natalie","Parde",{"paper_id":5334,"author_seq":232,"given_name":5345,"surname":5346,"affiliation":63,"orcid":63},"Rodney","Nielsen","Automatically scoring metaphor novelty is an unexplored topic in natural language processing, and research in this area could benefit a wide range of NLP tasks.  However, no publicly available metaphor novelty datasets currently exist, making it difficult to perform research on this topic.  We introduce a large corpus of metaphor novelty scores for syntactically related word pairs, and release it freely to the research community.  We describe the corpus here, and include an analysis of its score distribution and the types of word pairs included in the corpus.  We also provide a brief overview of standard metaphor detection corpora, to provide the reader with greater context regarding how this corpus compares to other datasets used for different types of computational metaphor processing.  
Finally, we establish a performance benchmark to which future researchers can compare, and show that it is possible to learn to score metaphor novelty on our dataset at a rate ignificantly better than chance or naive strategies.",{"paper_id":5349,"title":5350,"year":81,"month":855,"day":63,"doi":5351,"resource_url":5352,"first_page":63,"last_page":63,"pdf_url":5353,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5354,"paper_type":860,"authors":5355,"abstract":5362},"lrec2018-main-244","Improving Hypernymy Extraction with Distributional Semantic Classes","10.63317\u002F4mcvgsrpbbx8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-244","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F234.pdf","panchenko-etal-2018-improving",[5356,5357,5358,5359,5361],{"paper_id":5349,"author_seq":247,"given_name":2736,"surname":2737,"affiliation":63,"orcid":63},{"paper_id":5349,"author_seq":232,"given_name":3943,"surname":3944,"affiliation":63,"orcid":63},{"paper_id":5349,"author_seq":218,"given_name":2733,"surname":2734,"affiliation":63,"orcid":63},{"paper_id":5349,"author_seq":203,"given_name":5360,"surname":2742,"affiliation":63,"orcid":63},"Simone P.",{"paper_id":5349,"author_seq":188,"given_name":1367,"surname":2739,"affiliation":63,"orcid":63},"In this paper, we show how distributionally-induced semantic classes can be helpful  for extracting hypernyms. We  present methods for inducing sense-aware semantic classes using distributional semantics and using these induced semantic classes for filtering noisy hypernymy relations. Denoising of hypernyms is performed by labeling each semantic class with its hypernyms. On the one hand, this allows us to filter out wrong extractions using the global structure of  distributionally similar senses. On the other hand, we infer missing hypernyms via label propagation to cluster terms. 
We conduct a large-scale crowdsourcing study showing that processing of automatically extracted hypernyms using our approach improves the quality of the hypernymy extraction in terms of both precision and recall. Furthermore, we show the utility of our method in the domain taxonomy induction task, achieving the state-of-the-art results on a SemEval'16 task on taxonomy induction.",{"paper_id":5364,"title":5365,"year":81,"month":855,"day":63,"doi":5366,"resource_url":5367,"first_page":63,"last_page":63,"pdf_url":5368,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5369,"paper_type":860,"authors":5370,"abstract":5382},"lrec2018-main-245","Laying the Groundwork for Knowledge Base Population: Nine Years of Linguistic Resources for TAC KBP","10.63317\u002F2o6ywcrcar6k","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-245","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1047.pdf","getman-etal-2018-laying",[5371,5373,5376,5378,5380],{"paper_id":5364,"author_seq":247,"given_name":1124,"surname":5372,"affiliation":63,"orcid":63},"Getman",{"paper_id":5364,"author_seq":232,"given_name":5374,"surname":5375,"affiliation":63,"orcid":63},"Joe","Ellis",{"paper_id":5364,"author_seq":218,"given_name":1205,"surname":5377,"affiliation":63,"orcid":63},"Strassel",{"paper_id":5364,"author_seq":203,"given_name":5379,"surname":2599,"affiliation":63,"orcid":63},"Zhiyi",{"paper_id":5364,"author_seq":188,"given_name":4518,"surname":5381,"affiliation":63,"orcid":63},"Tracey","Knowledge Base Population (KBP) is an evaluation series within the Text Analysis Conference (TAC) evaluation campaign conducted by the National Institute of Standards and Technology (NIST). Over the past nine years TAC KBP evaluations have targeted information extraction technologies for the population of knowledge bases comprised of entities, relations, and events. 
Linguistic Data Consortium (LDC) has supported TAC KBP since 2009, developing, maintaining, and distributing linguistic resources in three languages for seven distinct evaluation tracks. This paper describes LDC's resource creation efforts for the various KBP tracks, and highlights changes made over the years to support evolving evaluation requirements.",{"paper_id":5384,"title":5385,"year":81,"month":855,"day":63,"doi":5386,"resource_url":5387,"first_page":63,"last_page":63,"pdf_url":5388,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5389,"paper_type":860,"authors":5390,"abstract":5400},"lrec2018-main-246","A Dataset for Inter-Sentence Relation Extraction using Distant Supervision","10.63317\u002F374gmpeb43yn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-246","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F790.pdf","mandya-etal-2018-dataset",[5391,5394,5395,5398],{"paper_id":5384,"author_seq":247,"given_name":5392,"surname":5393,"affiliation":63,"orcid":63},"Angrosh","Mandya",{"paper_id":5384,"author_seq":232,"given_name":1540,"surname":1541,"affiliation":63,"orcid":63},{"paper_id":5384,"author_seq":218,"given_name":5396,"surname":5397,"affiliation":63,"orcid":63},"Frans","Coenen",{"paper_id":5384,"author_seq":203,"given_name":5135,"surname":5399,"affiliation":63,"orcid":63},"Atkinson","This paper presents a benchmark dataset for the task of inter-sentence relation extraction. The paper explains the distant supervision method followed for creating the dataset for inter-sentence relation extraction,  involving relations previously used for standard intra-sentence relation extraction task. 
The study evaluates baseline models such as bag-of-words and sequence based recurrent neural network models on the developed dataset and shows that recurrent neural network models as more useful for the task of intra-sentence relation extraction.Comparing the results of the present work on intra-sentence relation extraction with previous work on inter-sentence relation extraction, the study identifies the need for more sophisticated models to handle long-range information between entities across sentences.",{"paper_id":5402,"title":5403,"year":81,"month":855,"day":63,"doi":5404,"resource_url":5405,"first_page":63,"last_page":63,"pdf_url":5406,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5407,"paper_type":860,"authors":5408,"abstract":5417},"lrec2018-main-247","Diacritics Restoration Using Neural Networks","10.63317\u002F4nbymfmstb9a","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-247","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F573.pdf","naplava-etal-2018-diacritics",[5409,5412,5415,5416],{"paper_id":5402,"author_seq":247,"given_name":5410,"surname":5411,"affiliation":63,"orcid":63},"Jakub","Náplava",{"paper_id":5402,"author_seq":232,"given_name":5413,"surname":5414,"affiliation":63,"orcid":63},"Milan","Straka",{"paper_id":5402,"author_seq":218,"given_name":4670,"surname":4671,"affiliation":63,"orcid":63},{"paper_id":5402,"author_seq":203,"given_name":2633,"surname":3462,"affiliation":63,"orcid":63},"In this paper, we describe a novel combination of a character-level recurrent neural-network based model and a language model applied to diacritics restoration. In many cases in the past and still at present, people often replace characters with diacritics with their ASCII counterparts. Despite the fact that the resulting text is usually easy to understand for humans, it is much harder for further computational processing. 
This paper opens with a discussion of applicability of restoration of diacritics in selected languages. Next, we present a neural network-based approach to diacritics generation. The core component of our model is a bidirectional recurrent neural network operating at a character level. We evaluate the model on two existing datasets consisting of four European languages. When combined with a language model, our model reduces the error of current best systems by 20% to 64%. Finally, we propose a pipeline for obtaining consistent diacritics restoration datasets for twelve languages and evaluate our model on it. All the code is available under open source license on https:\u002F\u002Fgithub.com\u002Farahusky\u002Fdiacritics_restoration.",{"paper_id":5419,"title":5420,"year":81,"month":855,"day":63,"doi":5421,"resource_url":5422,"first_page":63,"last_page":63,"pdf_url":5423,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5424,"paper_type":860,"authors":5425,"abstract":5431},"lrec2018-main-248","Ensemble Romanian Dependency Parsing with Neural Networks","10.63317\u002F3w2tur3s2erv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-248","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F569.pdf","ion-etal-2018-ensemble",[5426,5429,5430],{"paper_id":5419,"author_seq":247,"given_name":5427,"surname":5428,"affiliation":63,"orcid":63},"Radu","Ion",{"paper_id":5419,"author_seq":232,"given_name":3684,"surname":4361,"affiliation":63,"orcid":63},{"paper_id":5419,"author_seq":218,"given_name":4356,"surname":4357,"affiliation":63,"orcid":63},"SSPR (Semantics-driven Syntactic Parser for Romanian) is a neural network ensemble parser developed for Romanian (a Python 3.5 application based on the Microsoft Cognitive Toolkit 2.0 Python API) that combines the parsing decisions of a varying number (in our experiments, 3) of other parsers (MALT, RGB and MATE), using information from additional lexical, morpho-syntactic and 
semantic features. SSPR outperforms the best individual parser (MATE in our case) with 1.6% LAS points and it is in the same class with the top 5 Romanian performers at the CONLL 2017 dependency parsing shared task. The train and test sets were extracted from a Romanian dependency treebank we developed and validated in the Universal Dependencies format. The treebank, used in the CONLL 2017 Romanian track as well, is open licenced; the parser is available on request.",{"paper_id":5433,"title":5434,"year":81,"month":855,"day":63,"doi":5435,"resource_url":5436,"first_page":63,"last_page":63,"pdf_url":5437,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5438,"paper_type":860,"authors":5439,"abstract":5447},"lrec2018-main-249","Classifying Sluice Occurrences in Dialogue","10.63317\u002F3be652jfmmtk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-249","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F180.pdf","baird-etal-2018-classifying",[5440,5442,5445],{"paper_id":5433,"author_seq":247,"given_name":5327,"surname":5441,"affiliation":63,"orcid":63},"Baird",{"paper_id":5433,"author_seq":232,"given_name":5443,"surname":5444,"affiliation":63,"orcid":63},"Anissa","Hamza",{"paper_id":5433,"author_seq":218,"given_name":2554,"surname":5446,"affiliation":63,"orcid":63},"Hardt","Ellipsis is an important challenge for natural language processing systems, and addressing that challenge requires large collections of relevant data. The dataset described by Anand and McCloskey (2015), consisting of 4100 occurrences, is an important step towards addressing this issue. However, many NLP technologies require much larger collections of data. Furthermore, previous collections of ellipsis are primarily restricted to news data, although sluicing presents a particularly important challenge for dialogue systems. In this paper we classify sluices as Direct, Reprise, Clarification. 
We perform manual annotation with acceptable inter-coder agreement. We build classifier models with Decision Trees and Naive Bayes, with accuracy of 67%. We deploy a classifier to automatically classify sluice occurrences in OpenSubtitles, resulting in a corpus with 1.7 million occurrences. This will support empirical research into sluicing in dialogue, and it will also make it possible to build NLP systems using very large datasets. This is a noisy dataset; based on a small manually annotated sample, we found that only 80% of instances are in fact sluices, and the accuracy of sluice classification is lower. Despite this, the corpus can be of great use in research on sluicing and development of systems, and we are making the corpus freely available on request. Furthermore, we are in the process of improving the accuracy of sluice identification and annotation for the purpose of created a subsequent version of this corpus.",{"paper_id":5449,"title":5450,"year":81,"month":855,"day":63,"doi":5451,"resource_url":5452,"first_page":63,"last_page":63,"pdf_url":5453,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5454,"paper_type":860,"authors":5455,"abstract":5476},"lrec2018-main-250","Collection of Multimodal Dialog Data and Analysis of the Result of Annotation of Users’ Interest 
Level","10.63317\u002F598vipg7kbwt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-250","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F270.pdf","araki-etal-2018-collection",[5456,5458,5461,5464,5467,5470,5473],{"paper_id":5449,"author_seq":247,"given_name":4447,"surname":5457,"affiliation":63,"orcid":63},"Araki",{"paper_id":5449,"author_seq":232,"given_name":5459,"surname":5460,"affiliation":63,"orcid":63},"Sayaka","Tomimasu",{"paper_id":5449,"author_seq":218,"given_name":5462,"surname":5463,"affiliation":63,"orcid":63},"Mikio","Nakano",{"paper_id":5449,"author_seq":203,"given_name":5465,"surname":5466,"affiliation":63,"orcid":63},"Kazunori","Komatani",{"paper_id":5449,"author_seq":188,"given_name":5468,"surname":5469,"affiliation":63,"orcid":63},"Shogo","Okada",{"paper_id":5449,"author_seq":172,"given_name":5471,"surname":5472,"affiliation":63,"orcid":63},"Shinya","Fujie",{"paper_id":5449,"author_seq":155,"given_name":5474,"surname":5475,"affiliation":63,"orcid":63},"Hiroaki","Sugiyama","The Human-System Multimodal Dialogue Sharing Corpus Building Group is acting as a working group of SIG-SLUD for the purpose of constructing a corpus for evaluating elemental technologies of the multimodal dialogue system. 
In this paper, we report the results of recording chat dialogue data between a human and a virtual agent by the Wizard of OZ method conducted in 2016, and the results of the analysis of annotations of users' interest level in the data.",{"paper_id":5478,"title":5479,"year":81,"month":855,"day":63,"doi":5480,"resource_url":5481,"first_page":63,"last_page":63,"pdf_url":5482,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5483,"paper_type":860,"authors":5484,"abstract":5501},"lrec2018-main-251","Recognizing Behavioral Factors while Driving: A Real-World Multimodal Corpus to Monitor the Driver’s Affective State","10.63317\u002F3nvx9vvwg8fv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-251","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F513.pdf","lotz-etal-2018-recognizing",[5485,5488,5491,5494,5497,5499],{"paper_id":5478,"author_seq":247,"given_name":5486,"surname":5487,"affiliation":63,"orcid":63},"Alicia","Lotz",{"paper_id":5478,"author_seq":232,"given_name":5489,"surname":5490,"affiliation":63,"orcid":63},"Klas","Ihme",{"paper_id":5478,"author_seq":218,"given_name":5492,"surname":5493,"affiliation":63,"orcid":63},"Audrey","Charnoz",{"paper_id":5478,"author_seq":203,"given_name":5495,"surname":5496,"affiliation":63,"orcid":63},"Pantelis","Maroudis",{"paper_id":5478,"author_seq":188,"given_name":3755,"surname":5498,"affiliation":63,"orcid":63},"Dmitriev",{"paper_id":5478,"author_seq":172,"given_name":3238,"surname":5500,"affiliation":63,"orcid":63},"Wendemuth","The presented study concentrates on the collection of emotional multimodal real-world in-car audio, video and physiological signal recordings while driving. To do so, three sensor systems were integrated in the car and four relevant emotional states of the driver were defined: neutral, positive, frustrated and anxious. 
To gather as natural as possible emotional data of the driver, the subjects needed to be unbiased and were therefore kept unaware of the detailed research objective. The emotions were induced using so-called Wizard-of-Oz experiments, where the drivers believed to be interacting with an automated technical system, which in fact was controlled by a human. Additionally, on board interviews while driving were conducted by an instructed psychologist. To evaluate the collected data, questionnaires were filled out by the subjects before, during and after the data collection. These included monitoring of the drivers perceived state of emotion, stress, sleepiness and thermal sensation but also detailed questionnaires on their driving experience, attitude towards technology and big five OCEAN personality traits. Afterwards, the data was annotated by expert labelers. Exemplary results of the evaluation of the experiments are given in the result section of this paper. They indicate that the emotional states were successfully induced and the annotation results are consistent for both performed annotation approaches.",{"paper_id":5503,"title":5504,"year":81,"month":855,"day":63,"doi":5505,"resource_url":5506,"first_page":63,"last_page":63,"pdf_url":5507,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5508,"paper_type":860,"authors":5509,"abstract":5522},"lrec2018-main-252","EmotionLines: An Emotion Corpus of Multi-Party 
Conversations","10.63317\u002F2ae3p3x5wnc7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-252","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F581.pdf","hsu-etal-2018-emotionlines",[5510,5513,5515,5517,5519],{"paper_id":5503,"author_seq":247,"given_name":5511,"surname":5512,"affiliation":63,"orcid":63},"Chao-Chun","Hsu",{"paper_id":5503,"author_seq":232,"given_name":5514,"surname":2401,"affiliation":63,"orcid":63},"Sheng-Yeh",{"paper_id":5503,"author_seq":218,"given_name":5516,"surname":1612,"affiliation":63,"orcid":63},"Chuan-Chun",{"paper_id":5503,"author_seq":203,"given_name":5518,"surname":3506,"affiliation":63,"orcid":63},"Ting-Hao",{"paper_id":5503,"author_seq":188,"given_name":5520,"surname":5521,"affiliation":63,"orcid":63},"Lun-Wei","Ku","Feeling emotion is a critical characteristic to distinguish people from machines. Among all the multi-modal resources for emotion detection, textual datasets are those containing the least additional information in addition to semantics, and hence are adopted widely for testing the developed systems. However, most of the textual emotional datasets consist of emotion labels of only individual words, sentences or documents, which makes it challenging to discuss the contextual flow of emotions. In this paper, we introduce EmotionLines, the first dataset with emotions labeling on all utterances in each dialogue only based on their textual content. Dialogues in EmotionLines are collected from Friends TV scripts and private Facebook messenger dialogues. Then one of seven emotions, six Ekman’s basic emotions plus the neutral emotion, is labeled on each utterance by 5 Amazon MTurkers. A total of 29,245 utterances from 2,000 dialogues are labeled in EmotionLines. 
We also provide several strong baselines for emotion detection models on EmotionLines in this paper.",{"paper_id":5524,"title":5525,"year":81,"month":855,"day":63,"doi":5526,"resource_url":5527,"first_page":63,"last_page":63,"pdf_url":5528,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5529,"paper_type":860,"authors":5530,"abstract":5537},"lrec2018-main-253","Academic-Industrial Perspective on the Development and Deployment of a Moderation System for a Newspaper Website","10.63317\u002F3imw2n9hywta","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-253","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8885.pdf","schabus-skowron-2018-academic",[5531,5534],{"paper_id":5524,"author_seq":247,"given_name":5532,"surname":5533,"affiliation":63,"orcid":63},"Dietmar","Schabus",{"paper_id":5524,"author_seq":232,"given_name":5535,"surname":5536,"affiliation":63,"orcid":63},"Marcin","Skowron","This paper describes an approach and our experiences from the development, deployment and usability testing of a Natural Language Processing (NLP) and Information Retrieval system that supports the moderation of user comments on a large newspaper website. We highlight some of the differences between industry-oriented and academic research settings and their influence on the decisions made in the data collection and annotation processes, selection of document representation and machine learning methods. We report on classification results, where the problems to solve and the data to work with come from a commercial enterprise. In this context typical for NLP research, we discuss relevant industrial aspects. 
We believe that the challenges faced as well as the solutions proposed for addressing them can provide insights to others working in a similar setting.",{"paper_id":5539,"title":5540,"year":81,"month":855,"day":63,"doi":5541,"resource_url":5542,"first_page":63,"last_page":63,"pdf_url":5543,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5544,"paper_type":860,"authors":5545,"abstract":5555},"lrec2018-main-254","Community-Driven Crowdsourcing: Data Collection with Local Developers","10.63317\u002F4546pitg6fa2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-254","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8881.pdf","funk-etal-2018-community",[5546,5548,5549,5552],{"paper_id":5539,"author_seq":247,"given_name":4562,"surname":5547,"affiliation":63,"orcid":63},"Funk",{"paper_id":5539,"author_seq":232,"given_name":1780,"surname":4689,"affiliation":63,"orcid":63},{"paper_id":5539,"author_seq":218,"given_name":5550,"surname":5551,"affiliation":63,"orcid":63},"Ravindran","Rajakumar",{"paper_id":5539,"author_seq":203,"given_name":5553,"surname":5554,"affiliation":63,"orcid":63},"Linne","Ha","We tested the viability of partnering with local developers to create custom annotation applications and to recruit and motivate crowd contributors from their communities to perform an annotation task consisting of the assignment of toxicity ratings to Wikipedia comments. We discuss the background of the project, the design of the community-driven approach, the developers’ execution of their applications and crowdsourcing programs, and the quantity, quality, and cost of judgments, in comparison with previous approaches. The community-driven approach resulted in local developers successfully creating four unique tools and collecting labeled data of sufficiently high quantity and quality. 
The creative approaches to the rating task presentation and crowdsourcing program design drew upon developers’ local knowledge of their own social networks, who also reported interest in the underlying problem that the data collection addresses. We consider the lessons that may be drawn from this project for implementing future iterations of the community-driven approach.",{"paper_id":5557,"title":5558,"year":81,"month":855,"day":63,"doi":5559,"resource_url":5560,"first_page":63,"last_page":63,"pdf_url":5561,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5562,"paper_type":860,"authors":5563,"abstract":5586},"lrec2018-main-255","Building Open Javanese and Sundanese Corpora for Multilingual Text-to-Speech","10.63317\u002F4iw9raqzmnyn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-255","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8888.pdf","wibawa-etal-2018-building",[5564,5567,5570,5572,5575,5578,5581,5583,5585],{"paper_id":5557,"author_seq":247,"given_name":5565,"surname":5566,"affiliation":63,"orcid":63},"Jaka Aris 
Eko","Wibawa",{"paper_id":5557,"author_seq":232,"given_name":5568,"surname":5569,"affiliation":63,"orcid":63},"Supheakmungkol","Sarin",{"paper_id":5557,"author_seq":218,"given_name":5571,"surname":1591,"affiliation":63,"orcid":63},"Chenfang",{"paper_id":5557,"author_seq":203,"given_name":5573,"surname":5574,"affiliation":63,"orcid":63},"Knot","Pipatsrisawat",{"paper_id":5557,"author_seq":188,"given_name":5576,"surname":5577,"affiliation":63,"orcid":63},"Keshan","Sodimana",{"paper_id":5557,"author_seq":172,"given_name":5579,"surname":5580,"affiliation":63,"orcid":63},"Oddur","Kjartansson",{"paper_id":5557,"author_seq":155,"given_name":2736,"surname":5582,"affiliation":63,"orcid":63},"Gutkin",{"paper_id":5557,"author_seq":138,"given_name":1248,"surname":5584,"affiliation":63,"orcid":63},"Jansche",{"paper_id":5557,"author_seq":121,"given_name":5553,"surname":5554,"affiliation":63,"orcid":63},"We present multi-speaker text-to-speech corpora for Javanese and Sundanese, the second and third largest languages of Indonesia spoken by well over a hundred million people. The key objectives were to collect high-quality data in an affordable way and to share the data publicly with the speech community. To achieve this, we collaborated with two local universities in Java and streamlined our recording and crowdsourcing processes to produce corpora consisting of 5,800 (Javanese) and 4,200 (Sundanese) mixed-gender recordings. We used these corpora to build several configurations of multi-speaker neural network-based text-to-speech systems for Javanese and Sundanese. Subjective evaluations performed on these configurations demonstrate that multilingual configurations for which Javanese and Sundanese are trained jointly with a larger corpus of Standard Indonesian significantly outperform the systems constructed from a single language. 
We hope that sharing these corpora publicly and presenting our multilingual approach to text-to-speech will help the community to scale up text-to-speech technologies to other lesser resourced languages of Indonesia.",{"paper_id":5588,"title":5589,"year":81,"month":855,"day":63,"doi":5590,"resource_url":5591,"first_page":63,"last_page":63,"pdf_url":5592,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5593,"paper_type":860,"authors":5594,"abstract":5604},"lrec2018-main-256","An Integrated Representation of Linguistic and Social Functions of Code-Switching","10.63317\u002F38abmzhkyrxq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-256","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F392.pdf","hartmann-etal-2018-integrated",[5595,5598,5601],{"paper_id":5588,"author_seq":247,"given_name":5596,"surname":5597,"affiliation":63,"orcid":63},"Silvana","Hartmann",{"paper_id":5588,"author_seq":232,"given_name":5599,"surname":5600,"affiliation":63,"orcid":63},"Monojit","Choudhury",{"paper_id":5588,"author_seq":218,"given_name":5602,"surname":5603,"affiliation":63,"orcid":63},"Kalika","Bali","We present an integrated representation of code-switching (CS) functions, i.e., a representation that includes various CS phenomena (intra-\u002Finter-sentential) and modalities (written\u002Fspoken), and aims to derive CS functions from local and global properties of the code-switched discourse. By applying it to several English\u002FHindi CS datasets, we show that our model contributes i) to the standardization and re-use of CS data collections by creating a resource footprint, and ii) to the study of CS functions by creating a systematic description and hierarchy of reported functions together with the (local and social) properties that may affect them. 
At the same time, the model provides a flexible framework to add emerging functions, supporting theoretical studies as well as the automatic detection of CS functions.",{"paper_id":5606,"title":5607,"year":81,"month":855,"day":63,"doi":5608,"resource_url":5609,"first_page":63,"last_page":63,"pdf_url":5610,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5611,"paper_type":860,"authors":5612,"abstract":5618},"lrec2018-main-257","A Corpus of eRulemaking User Comments for Measuring Evaluability of Arguments","10.63317\u002F5p3rqiyyc6mq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-257","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F679.pdf","park-cardie-2018-corpus",[5613,5616],{"paper_id":5606,"author_seq":247,"given_name":5614,"surname":5615,"affiliation":63,"orcid":63},"Joonsuk","Park",{"paper_id":5606,"author_seq":232,"given_name":1217,"surname":5617,"affiliation":63,"orcid":63},"Cardie","eRulemaking is a means for government agencies to directly reach citizens to solicit their opinions and experiences regarding newly proposed rules. The effort, however, is partly hampered by citizens' comments that lack reasoning and evidence, which are largely ignored since government agencies are unable to evaluate the validity and strength. We present Cornell eRulemaking Corpus -- CDCP, an argument mining corpus annotated with argumentative structure information capturing the evaluability of arguments. The corpus consists of 731 user comments on Consumer Debt Collection Practices (CDCP) rule by the Consumer Financial Protection Bureau (CFPB); the resulting dataset contains 4931 elementary unit and 1221 support relation annotations. It is a resource for building argument mining systems that can not only extract arguments from unstructured text, but also identify what additional information is necessary for readers to understand and evaluate a given argument. 
Immediate applications include providing real-time feedback to commenters, specifying which types of support for which propositions can be added to construct better-formed arguments.",{"paper_id":5620,"title":5621,"year":81,"month":855,"day":63,"doi":5622,"resource_url":5623,"first_page":63,"last_page":63,"pdf_url":5624,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5625,"paper_type":860,"authors":5626,"abstract":5642},"lrec2018-main-258","A Multi-layer Annotated Corpus of Argumentative Text: From Argument Schemes to Discourse Relations","10.63317\u002F4rf82w2rkdx3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-258","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F717.pdf","musi-etal-2018-multi",[5627,5629,5632,5634,5637,5640],{"paper_id":5620,"author_seq":247,"given_name":3684,"surname":5628,"affiliation":63,"orcid":63},"Musi",{"paper_id":5620,"author_seq":232,"given_name":5630,"surname":5631,"affiliation":63,"orcid":63},"Tariq","Alhindi",{"paper_id":5620,"author_seq":218,"given_name":1058,"surname":5633,"affiliation":63,"orcid":63},"Stede",{"paper_id":5620,"author_seq":203,"given_name":5635,"surname":5636,"affiliation":63,"orcid":63},"Leonard","Kriese",{"paper_id":5620,"author_seq":188,"given_name":5638,"surname":5639,"affiliation":63,"orcid":63},"Smaranda","Muresan",{"paper_id":5620,"author_seq":172,"given_name":2516,"surname":5641,"affiliation":63,"orcid":63},"Rocci","We present a multi-layer annotated corpus of 112 argumentative microtexts encompassing not only argument structure and discourse relations (Stede et al., 2016), but also argument schemes — the inferential relations linking premises to claims. We propose a set of guidelines for the annotation of argument schemes both for support and attack relations, and a new user-friendly annotation tool. 
The multi-layer annotated corpus allows us to conduct an initial study of dependencies between discourse relations (according to Rhetorical Structure Theory (Mann and Thompson, 1988)) and argument schemes. Our main contribution is that of offering the first resource for the combined study of (argumentative) discourse relations and inferential moves.",{"paper_id":5644,"title":5645,"year":81,"month":855,"day":63,"doi":5646,"resource_url":5647,"first_page":63,"last_page":63,"pdf_url":5648,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5649,"paper_type":860,"authors":5650,"abstract":5655},"lrec2018-main-259","Discourse Coherence Through the Lens of an Annotated Text Corpus: A Case Study","10.63317\u002F383d5gmjyb9x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-259","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F20.pdf","hajicova-mirovsky-2018-discourse",[5651,5652],{"paper_id":5644,"author_seq":247,"given_name":890,"surname":3460,"affiliation":63,"orcid":63},{"paper_id":5644,"author_seq":232,"given_name":5653,"surname":5654,"affiliation":63,"orcid":63},"Jiří","Mírovský","A corpus-based study of local coherence as established by anaphoric links between the elements in the thematic (Topic) and the rhematic (Focus) parts of sentences in different genres of discourse. The study uses the Czech data present in the Prague Dependency Treebank and annotated for surface and underlying syntactic relations, the contextual boundness of tree nodes (from which the bi-partition of the sentence into Topic and Focus can be derived) and the coreference and bridging relations. Among the four possible types of the relations between anaphoric links and the Topic–Focus bipartition of the sentence, the most frequently occurring type is a link between the Topic of the sentence to the Focus of the immediately preceding sentence. 
In case there is an anaphoric link leading from the Focus of one sentence to the Topic or Focus of the immediately preceding sentence, this link frequently leads  from a contextually bound element of the Focus, which supports the assumption that it is convenient to distinguish between “overall” Topic and Focus and the local Topic and Focus and\u002For the anaphoric relation is of the type of bridging and the relationship is often interpreted as a contrast. As for the relationship between the relations of the Topic-to-Topic type, due to the word order typological difference for Czech and English, these relations in Czech are not at all related to the syntactic function of subject.",{"paper_id":5657,"title":5658,"year":81,"month":855,"day":63,"doi":5659,"resource_url":5660,"first_page":63,"last_page":63,"pdf_url":5661,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5662,"paper_type":860,"authors":5663,"abstract":5675},"lrec2018-main-260","Automatic Prediction of Discourse Connectives","10.63317\u002F3ztvcigon75v","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-260","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F203.pdf","malmi-etal-2018-automatic",[5664,5667,5670,5673],{"paper_id":5657,"author_seq":247,"given_name":5665,"surname":5666,"affiliation":63,"orcid":63},"Eric","Malmi",{"paper_id":5657,"author_seq":232,"given_name":5668,"surname":5669,"affiliation":63,"orcid":63},"Daniele","Pighin",{"paper_id":5657,"author_seq":218,"given_name":5671,"surname":5672,"affiliation":63,"orcid":63},"Sebastian","Krause",{"paper_id":5657,"author_seq":203,"given_name":2876,"surname":5674,"affiliation":63,"orcid":63},"Kozhevnikov","Accurate prediction of suitable discourse connectives (however, furthermore, etc.) is a key component of any system aimed at building coherent and fluent discourses from shorter sentences and passages. 
As an example, a dialog system might assemble a long and informative answer by sampling passages extracted from different documents retrieved from the Web. We formulate the task of discourse connective prediction and release a dataset of 2.9M sentence pairs separated by discourse connectives for this task. Then, we evaluate the hardness of the task for human raters, apply a recently proposed decomposable attention (DA) model to this task and observe that the automatic predictor has a higher F1 than human raters (32 vs. 30).  Nevertheless, under specific conditions the raters still outperform the DA model, suggesting that there is headroom for future improvements.",{"paper_id":5677,"title":5678,"year":81,"month":855,"day":63,"doi":5679,"resource_url":5680,"first_page":63,"last_page":63,"pdf_url":5681,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5682,"paper_type":860,"authors":5683,"abstract":5699},"lrec2018-main-261","Handling Rare Word Problem using Synthetic Training Data for Sinhala and Tamil Neural Machine Translation","10.63317\u002F4w4hdc3gg225","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-261","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F949.pdf","tennage-etal-2018-handling",[5684,5687,5690,5693,5696],{"paper_id":5677,"author_seq":247,"given_name":5685,"surname":5686,"affiliation":63,"orcid":63},"Pasindu","Tennage",{"paper_id":5677,"author_seq":232,"given_name":5688,"surname":5689,"affiliation":63,"orcid":63},"Prabath","Sandaruwan",{"paper_id":5677,"author_seq":218,"given_name":5691,"surname":5692,"affiliation":63,"orcid":63},"Malith","Thilakarathne",{"paper_id":5677,"author_seq":203,"given_name":5694,"surname":5695,"affiliation":63,"orcid":63},"Achini","Herath",{"paper_id":5677,"author_seq":188,"given_name":5697,"surname":5698,"affiliation":63,"orcid":63},"Surangika","Ranathunga","Lack of parallel training data influences the rare word problem in Neural Machine Translation (NMT) 
systems, particularly for under-resourced languages. Using synthetic parallel training data (data augmentation) is a promising approach to handle the rare word problem. Previously proposed methods for data augmentation do not consider language syntax when generating synthetic training data. This leads to generation of sentences that lower the overall quality of parallel training data. In this paper, we discuss the suitability of using Parts of Speech (POS) tagging and morphological analysis as syntactic features to prune the generated synthetic sentence pairs that do not adhere to language syntax.  Our models show an overall 2.16 and 5.00 BLEU score gains over our benchmark Sinhala to Tamil and Tamil to Sinhala translation systems, respectively. Although we focus on Sinhala and Tamil NMT for the domain of official government documents, we believe that these synthetic data pruning techniques can be generalized to any language pair.",{"paper_id":5701,"title":5702,"year":81,"month":855,"day":63,"doi":5703,"resource_url":5704,"first_page":63,"last_page":63,"pdf_url":5705,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5706,"paper_type":860,"authors":5707,"abstract":5718},"lrec2018-main-262","BDPROTO: A Database of Phonological Inventories from Ancient and Reconstructed 
Languages","10.63317\u002F4snupxfryrhj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-262","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F534.pdf","marsico-etal-2018-bdproto",[5708,5711,5714,5717],{"paper_id":5701,"author_seq":247,"given_name":5709,"surname":5710,"affiliation":63,"orcid":63},"Egidio","Marsico",{"paper_id":5701,"author_seq":232,"given_name":5712,"surname":5713,"affiliation":63,"orcid":63},"Sebastien","Flavier",{"paper_id":5701,"author_seq":218,"given_name":5715,"surname":5716,"affiliation":63,"orcid":63},"Annemarie","Verkerk",{"paper_id":5701,"author_seq":203,"given_name":1085,"surname":3346,"affiliation":63,"orcid":63},"Here we present BDPROTO, a database comprised of phonological inventory data from 137 ancient and reconstructed languages. These data were extracted from historical linguistic reconstructions and brought together into a single unified, normalized, accessible, and Unicode-compliant language resource. This dataset is publicly available and we aim to engage language scientists doing research on language change and language evolution. 
We provide a short case study to highlight BDPROTO's research viability; using phylogenetic comparative methods and high-resolution language family trees, we investigate whether consonantal and vocalic systems differ in their rates of change over the last 10,000 years.",{"paper_id":5720,"title":5721,"year":81,"month":855,"day":63,"doi":5722,"resource_url":5723,"first_page":63,"last_page":63,"pdf_url":5724,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5725,"paper_type":860,"authors":5726,"abstract":5732},"lrec2018-main-263","Creating a Translation Matrix of the Bible’s Names Across 591 Languages","10.63317\u002F2nay3zhe8dar","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-263","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F770.pdf","wu-etal-2018-creating",[5727,5728,5731],{"paper_id":5720,"author_seq":247,"given_name":3702,"surname":3703,"affiliation":63,"orcid":63},{"paper_id":5720,"author_seq":232,"given_name":5729,"surname":5730,"affiliation":63,"orcid":63},"Nidhi","Vyas",{"paper_id":5720,"author_seq":218,"given_name":1199,"surname":3705,"affiliation":63,"orcid":63},"For many of the world's languages, the Bible is the only significant bilingual, or even monolingual, text, making it a unique training resource for tasks such as translation, named entity analysis, and transliteration. Given the Bible's small size, however, the output of standard word alignment tools can be extremely noisy, making downstream tasks difficult. In this work, we develop and release a novel resource of 1129 aligned Bible person and place names across 591 languages, which was constructed and improved using several approaches including weighted edit distance, machine-translation-based transliteration models, and affixal induction and transformation models. 
Our models outperform a widely used word aligner on 97% of test words, showing the particular efficacy of our approach on the impactful task of broadly multilingual named-entity alignment and translation across a remarkably large number of world languages. We further illustrate the utility of our translation matrix for the multilingual learning of name-related affixes and their semantics as well as transliteration of named entities.",{"paper_id":5734,"title":5735,"year":81,"month":855,"day":63,"doi":5736,"resource_url":5737,"first_page":63,"last_page":63,"pdf_url":5738,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5739,"paper_type":860,"authors":5740,"abstract":5755},"lrec2018-main-264","Building a Word Segmenter for Sanskrit Overnight","10.63317\u002F4xp7pwb4iff6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-264","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F669.pdf","reddy-etal-2018-building",[5741,5744,5747,5749,5751,5754],{"paper_id":5734,"author_seq":247,"given_name":5742,"surname":5743,"affiliation":63,"orcid":63},"Vikas","Reddy",{"paper_id":5734,"author_seq":232,"given_name":5745,"surname":5746,"affiliation":63,"orcid":63},"Amrith","Krishna",{"paper_id":5734,"author_seq":218,"given_name":5748,"surname":1847,"affiliation":63,"orcid":63},"Vishnu",{"paper_id":5734,"author_seq":203,"given_name":5750,"surname":2603,"affiliation":63,"orcid":63},"Prateek",{"paper_id":5734,"author_seq":188,"given_name":5752,"surname":5753,"affiliation":63,"orcid":63},"Vineeth","M R",{"paper_id":5734,"author_seq":172,"given_name":964,"surname":965,"affiliation":63,"orcid":63},"There is an abundance of digitised texts available in Sanskrit. However, the word segmentation task in such texts are challenging due to the issue of 'Sandhi'. 
In Sandhi, words in a sentence often fuse together to form a single chunk of text, where the word delimiter vanishes and sounds at the word boundaries undergo transformations, which is also reflected in the written text. Here, we propose an approach that uses a deep sequence to sequence (seq2seq) model that takes only the sandhied string as the input and predicts the unsandhied string. The state of the art models are linguistically involved and have external dependencies for the lexical and morphological analysis of the input. Our model can be trained \"overnight\" and be used for production. In spite of the knowledge-lean approach, our system performs better than the current state of the art by gaining a percentage increase of  16.79 % than the current state of the art.",{"paper_id":5757,"title":5758,"year":81,"month":855,"day":63,"doi":5759,"resource_url":5760,"first_page":63,"last_page":63,"pdf_url":5761,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5762,"paper_type":860,"authors":5763,"abstract":5772},"lrec2018-main-265","Simple Semantic Annotation and Situation Frames: Two Approaches to Basic Text Understanding in LORELEI","10.63317\u002F2pkkqqd5y8cr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-265","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1020.pdf","griffitt-etal-2018-simple",[5764,5767,5768,5771],{"paper_id":5757,"author_seq":247,"given_name":5765,"surname":5766,"affiliation":63,"orcid":63},"Kira","Griffitt",{"paper_id":5757,"author_seq":232,"given_name":4518,"surname":5381,"affiliation":63,"orcid":63},{"paper_id":5757,"author_seq":218,"given_name":5769,"surname":5770,"affiliation":63,"orcid":63},"Ann","Bies",{"paper_id":5757,"author_seq":203,"given_name":1205,"surname":5377,"affiliation":63,"orcid":63},"We present two types of semantic annotation developed for the DARPA Low Resource Languages for Emerging Incidents (LORELEI) program: Simple Semantic Annotation (SSA) and 
Situation Frames (SF). Both of these annotation approaches are concerned with labeling basic semantic information relevant to humanitarian aid and disaster relief (HADR) scenarios, with SSA serving as a more general resource and SF more directly supporting the evaluation of LORELEI technology. Mapping between information in different annotation tasks is an area of ongoing research for both system developers and data providers. We discuss the similarities and differences between the two types of LORELEI semantic annotation, along with ways in which the general semantic information captured in SSA can be leveraged in order to recognize HADR-oriented information captured by SF. To date we have produced annotations for nineteen LORELEI languages; by the program’s end both SF and SSA will be available for over two dozen typologically diverse languages. Initially data is provided to LORELEI performers and to participants in NIST’s Low Resource Human Language Technologies (LoReHLT) evaluation series. 
After their use in LORELEI and LoReHLT evaluations the data sets will be published in the LDC catalog.",{"paper_id":5774,"title":5775,"year":81,"month":855,"day":63,"doi":5776,"resource_url":5777,"first_page":63,"last_page":63,"pdf_url":5778,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5779,"paper_type":860,"authors":5780,"abstract":5793},"lrec2018-main-266","Abstract Meaning Representation of Constructions: The More We Include, the Better the Representation","10.63317\u002F2jz9ots2wpo3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-266","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F856.pdf","bonial-etal-2018-abstract",[5781,5782,5785,5786,5789,5790,5791,5792],{"paper_id":5774,"author_seq":247,"given_name":1217,"surname":1218,"affiliation":63,"orcid":63},{"paper_id":5774,"author_seq":232,"given_name":5783,"surname":5784,"affiliation":63,"orcid":63},"Bianca","Badarau",{"paper_id":5774,"author_seq":218,"given_name":5765,"surname":5766,"affiliation":63,"orcid":63},{"paper_id":5774,"author_seq":203,"given_name":5787,"surname":5788,"affiliation":63,"orcid":63},"Ulf","Hermjakob",{"paper_id":5774,"author_seq":188,"given_name":2184,"surname":3836,"affiliation":63,"orcid":63},{"paper_id":5774,"author_seq":172,"given_name":5126,"surname":5127,"affiliation":63,"orcid":63},{"paper_id":5774,"author_seq":155,"given_name":1022,"surname":1023,"affiliation":63,"orcid":63},{"paper_id":5774,"author_seq":138,"given_name":5330,"surname":5331,"affiliation":63,"orcid":63},"We describe the expansion of the Abstract Meaning Representation (AMR) project to provide coverage for the annotation of certain types of constructions. Past AMR annotations generally followed a practice of assigning the semantic roles associated with an individual lexical item, as opposed to a flexible pattern or template of multiple lexical items, which characterizes constructions such as ‘The X-er, The Y-er’ (exemplified in the title). 
Furthermore, a goal of AMR is to provide consistent semantic representation despite language-specific syntactic idiosyncracies. Thus, representing the meanings associated with fully syntactic patterns required a novel annotation approach. As one strategy in our approach, we expanded the AMR lexicon of predicate senses, or semantic ‘rolesets,’ to include entries for a growing set of constructions. Despite the challenging practical and theoretical questions encountered, the additions and updates to AMR annotation described here ensure more comprehensive semantic representations capturing both lexical and constructional meaning.",{"paper_id":5795,"title":5796,"year":81,"month":855,"day":63,"doi":5797,"resource_url":5798,"first_page":63,"last_page":63,"pdf_url":5799,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5800,"paper_type":860,"authors":5801,"abstract":5812},"lrec2018-main-267","Evaluating Scoped Meaning Representations","10.63317\u002F2hgnj3ivug8f","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-267","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F553.pdf","van-noord-etal-2018-evaluating",[5802,5803,5806,5809],{"paper_id":5795,"author_seq":247,"given_name":3014,"surname":3015,"affiliation":63,"orcid":63},{"paper_id":5795,"author_seq":232,"given_name":5804,"surname":5805,"affiliation":63,"orcid":63},"Lasha","Abzianidze",{"paper_id":5795,"author_seq":218,"given_name":5807,"surname":5808,"affiliation":63,"orcid":63},"Hessel","Haagsma",{"paper_id":5795,"author_seq":203,"given_name":5810,"surname":5811,"affiliation":63,"orcid":63},"Johan","Bos","Semantic parsing offers many opportunities to improve natural language understanding. We present a semantically annotated parallel corpus for English, German, Italian, and Dutch where sentences are aligned with scoped meaning representations in order to capture the semantics of negation, modals, quantification, and presupposition triggers. 
The semantic formalism is based on Discourse Representation Theory, but concepts are represented by WordNet synsets and thematic roles by VerbNet relations. Translating scoped meaning representations to sets of clauses enables us to compare them for the purpose of semantic parser evaluation and checking translations. This is done by computing precision and recall on matching clauses, in a similar way as is done for Abstract Meaning Representations. We show that our matching tool for evaluating scoped meaning representations is both accurate and efficient. Applying this matching tool to three baseline semantic parsers yields F-scores between 43% and 54%. A pilot study is performed to automatically find changes in meaning by comparing meaning representations of translations. This comparison turns out to be an additional way of (i) finding annotation mistakes and (ii) finding instances where our semantic analysis needs to be improved.",{"paper_id":5814,"title":5815,"year":81,"month":855,"day":63,"doi":5816,"resource_url":5817,"first_page":63,"last_page":63,"pdf_url":5818,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5819,"paper_type":860,"authors":5820,"abstract":5828},"lrec2018-main-268","Huge Automatically Extracted Training-Sets for Multilingual Word SenseDisambiguation","10.63317\u002F4dxiepzhjumm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-268","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F731.pdf","pasini-etal-2018-huge",[5821,5823,5826],{"paper_id":5814,"author_seq":247,"given_name":1894,"surname":5822,"affiliation":63,"orcid":63},"Pasini",{"paper_id":5814,"author_seq":232,"given_name":5824,"surname":5825,"affiliation":63,"orcid":63},"Francesco","Elia",{"paper_id":5814,"author_seq":218,"given_name":2647,"surname":5827,"affiliation":63,"orcid":63},"Navigli","We release to the community six large-scale sense-annotated datasets in multiple language to pave the way for supervised 
multilingual Word Sense Disambiguation.  Our datasets cover all the nouns in the English WordNet and their translations in other languages for a total of millions of sense-tagged sentences. Experiments prove that these corpora can be effectively used as training sets for supervised WSD systems, surpassing the state of the art for low-resourced languages and providing competitive results for English, where manually annotated training sets are available. The data is available at trainomatic.org.",{"paper_id":5830,"title":5831,"year":81,"month":855,"day":63,"doi":5832,"resource_url":5833,"first_page":63,"last_page":63,"pdf_url":5834,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5835,"paper_type":860,"authors":5836,"abstract":5842},"lrec2018-main-269","SentEval: An Evaluation Toolkit for Universal Sentence Representations","10.63317\u002F2gscircifffd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-269","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F757.pdf","conneau-kiela-2018-senteval",[5837,5839],{"paper_id":5830,"author_seq":247,"given_name":1127,"surname":5838,"affiliation":63,"orcid":63},"Conneau",{"paper_id":5830,"author_seq":232,"given_name":5840,"surname":5841,"affiliation":63,"orcid":63},"Douwe","Kiela","We introduce SentEval, a toolkit for evaluating the quality of universal sentence representations. SentEval encompasses a variety of tasks, including binary and multi-class classification, natural language inference and sentence similarity. The set of tasks was selected based on what appears to be the community consensus regarding the appropriate evaluations for universal sentence representations. The toolkit comes with scripts to download and preprocess datasets, and an easy interface to evaluate sentence encoders. 
The aim is to provide a fairer, less cumbersome and more centralized way for evaluating sentence representations.",{"paper_id":5844,"title":5845,"year":81,"month":855,"day":63,"doi":5846,"resource_url":5847,"first_page":63,"last_page":63,"pdf_url":5848,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5849,"paper_type":860,"authors":5850,"abstract":5853},"lrec2018-main-270","A Survey on Automatically-Constructed WordNets and their Evaluation: Lexical and Word Embedding-based Approaches","10.63317\u002F4qoznsbmtcwi","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-270","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1030.pdf","neale-2018-survey",[5851],{"paper_id":5844,"author_seq":247,"given_name":1085,"surname":5852,"affiliation":63,"orcid":63},"Neale","WordNets - lexical databases in which groups of synonyms are arranged according to the semantic relationships between them - are crucial resources in semantically-focused natural language processing tasks, but are extremely costly and labour intensive to produce. In languages besides English, this has led to growing interest in constructing and extending WordNets automatically, as an alternative to producing them from scratch. 
This paper describes various approaches to constructing WordNets automatically - by leveraging traditional lexical resources and newer trends such as word embeddings - and also offers a discussion of the issues affecting the evaluation of automatically constructed WordNets.",{"paper_id":5855,"title":5856,"year":81,"month":855,"day":63,"doi":5857,"resource_url":5858,"first_page":63,"last_page":63,"pdf_url":5859,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5860,"paper_type":860,"authors":5861,"abstract":5869},"lrec2018-main-271","Linguistically-driven Framework for Computationally Efficient and Scalable Sign Recognition","10.63317\u002F3p4cvkgxp7ru","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-271","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1069.pdf","metaxas-etal-2018-linguistically",[5862,5864,5866],{"paper_id":5855,"author_seq":247,"given_name":3204,"surname":5863,"affiliation":63,"orcid":63},"Metaxas",{"paper_id":5855,"author_seq":232,"given_name":1364,"surname":5865,"affiliation":63,"orcid":63},"Dilsizian",{"paper_id":5855,"author_seq":218,"given_name":5867,"surname":5868,"affiliation":63,"orcid":63},"Carol","Neidle","We introduce a new general framework for sign recognition from monocular video using limited quantities of annotated data. The novelty of the hybrid framework we describe here is that we exploit state-of-the art learning methods while also incorporating features based on what we know about the linguistic composition of lexical signs. In particular, we analyze hand shape, orientation, location, and motion trajectories, and then use CRFs to combine this linguistically significant information for purposes of sign recognition. Our robust modeling and recognition of these sub-components of sign production allow an efficient parameterization of the sign recognition problem as compared with purely data-driven methods. 
This parameterization enables a scalable and extendable time-series learning approach that advances the state of the art in sign recognition, as shown by the results reported here for recognition of isolated, citation-form, lexical signs from American Sign Language (ASL).",{"paper_id":5871,"title":5872,"year":81,"month":855,"day":63,"doi":5873,"resource_url":5874,"first_page":63,"last_page":63,"pdf_url":5875,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5876,"paper_type":860,"authors":5877,"abstract":5885},"lrec2018-main-272","CONDUCT: An Expressive Conducting Gesture Dataset for Sound Control","10.63317\u002F2j9iv444rq4t","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-272","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1075.pdf","chen-etal-2018-conduct",[5878,5879,5882],{"paper_id":5871,"author_seq":247,"given_name":1221,"surname":2401,"affiliation":63,"orcid":63},{"paper_id":5871,"author_seq":232,"given_name":5880,"surname":5881,"affiliation":63,"orcid":63},"Sylvie","Gibet",{"paper_id":5871,"author_seq":218,"given_name":5883,"surname":5884,"affiliation":63,"orcid":63},"Camille","Marteau","Recent research in music-gesture relationship has paid more attention on the sound variations and its corresponding gesture expressiveness. In this study we are interested by  gestures performed by orchestral conductors, with a focus on the expressive gestures made by the non dominant hand. We make the assumption that these gestures convey some meaning shared by most of conductors, and that they implicitly correspond to sound effects which can be encoded in musical scores. Following this hypothesis, we defined a collection of gestures for musical direction. These gestures are designed to correspond to well known functional effect on sounds, and they can be modulated to vary this effect by simply modifying one of their structural component (hand movement or hand shape). 
This paper presents the design of the gesture and sound sets and the protocol that has led to the database construction. The relevant musical excerpts and the related expressive gestures have been first defined by one expert musician. The gestures were then recorded through motion capture by two non experts who performed them along with recorded music. This database will serve as a basis for training gesture recognition system for live sound control and modulation.",{"paper_id":5887,"title":5888,"year":81,"month":855,"day":63,"doi":5889,"resource_url":5890,"first_page":63,"last_page":63,"pdf_url":5891,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5892,"paper_type":860,"authors":5893,"abstract":5901},"lrec2018-main-273","Neural Caption Generation for News Images","10.63317\u002F38koo4iw6neu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-273","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F725.pdf","batra-etal-2018-neural",[5894,5897,5898],{"paper_id":5887,"author_seq":247,"given_name":5895,"surname":5896,"affiliation":63,"orcid":63},"Vishwash","Batra",{"paper_id":5887,"author_seq":232,"given_name":1424,"surname":1425,"affiliation":63,"orcid":63},{"paper_id":5887,"author_seq":218,"given_name":5899,"surname":5900,"affiliation":63,"orcid":63},"George","Vogiatzis","Automatic caption generation of images has gained significant interest. It gives rise to a lot of interesting image-related applications. For example, it could help in image\u002Fvideo retrieval and management of the vast amount of multimedia data available on the Internet. It could also help in the development of tools that can aid visually impaired individuals in accessing multimedia content. In this paper, we particularly focus on news images and propose a methodology for automatically generating captions for news paper articles consisting of a text paragraph and an image. 
We propose several deep neural network architectures built upon Recurrent Neural Networks. Results on a BBC News dataset show that our proposed approach outperforms a traditional method based on Latent Dirichlet Allocation using both automatic evaluation based on BLEU scores and human evaluation.",{"paper_id":5903,"title":5904,"year":81,"month":855,"day":63,"doi":5905,"resource_url":5906,"first_page":63,"last_page":63,"pdf_url":5907,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5908,"paper_type":860,"authors":5909,"abstract":5922},"lrec2018-main-274","MPST: A Corpus of Movie Plot Synopses with Tags","10.63317\u002F3wi3hsv4xp6j","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-274","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F332.pdf","kar-etal-2018-mpst",[5910,5913,5916,5919],{"paper_id":5903,"author_seq":247,"given_name":5911,"surname":5912,"affiliation":63,"orcid":63},"Sudipta","Kar",{"paper_id":5903,"author_seq":232,"given_name":5914,"surname":5915,"affiliation":63,"orcid":63},"Suraj","Maharjan",{"paper_id":5903,"author_seq":218,"given_name":5917,"surname":5918,"affiliation":63,"orcid":63},"A. Pastor","López-Monroy",{"paper_id":5903,"author_seq":203,"given_name":5920,"surname":5921,"affiliation":63,"orcid":63},"Thamar","Solorio","Social tagging of movies reveals a wide range of heterogeneous information about movies, like the genre, plot structure, soundtracks, metadata, visual and emotional experiences. Such information can be valuable in building automatic systems to create tags for movies. Automatic tagging systems can help recommendation engines to improve the retrieval of similar movies as well as help viewers to know what to expect from a movie in advance. In this paper, we set out to the task of collecting a corpus of movie plot synopses and tags. 
We describe a methodology that enabled us to build a fine-grained set of around 70 tags exposing heterogeneous characteristics of movie plots and the multi-label associations of these tags with some 14K movie plot synopses. We investigate how these tags correlate with movies and the flow of emotions throughout different types of movies. Finally, we use this corpus to explore the feasibility of inferring tags from plot synopses. We expect the corpus will be useful in other tasks where analysis of narratives is relevant.",{"paper_id":5924,"title":5925,"year":81,"month":855,"day":63,"doi":5926,"resource_url":5927,"first_page":63,"last_page":63,"pdf_url":5928,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5929,"paper_type":860,"authors":5930,"abstract":5939},"lrec2018-main-275","OpenSubtitles2018: Statistical Rescoring of Sentence Alignments in Large, Noisy Parallel Corpora","10.63317\u002F4cin6ute7f3b","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-275","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F294.pdf","lison-etal-2018-opensubtitles2018",[5931,5933,5936],{"paper_id":5924,"author_seq":247,"given_name":1388,"surname":5932,"affiliation":63,"orcid":63},"Lison",{"paper_id":5924,"author_seq":232,"given_name":5934,"surname":5935,"affiliation":63,"orcid":63},"Jörg","Tiedemann",{"paper_id":5924,"author_seq":218,"given_name":5937,"surname":5938,"affiliation":63,"orcid":63},"Milen","Kouylekov","Movie and TV subtitles are a highly valuable resource for the compilation of parallel corpora thanks to their availability in large numbers and across many languages. However, the quality of the resulting sentence alignments is often lower than for other parallel corpora.        This paper presents a new major release of the OpenSubtitles collection of parallel corpora, which is extracted from a total of 3.7 million subtitles spread over 60 languages. 
In addition to a substantial increase in the corpus size (about 30% compared to the previous version), this new release associates explicit quality scores to each sentence alignment.  These scores are determined by a feedforward neural network based on simple language-independent features and estimated on a sample of aligned sentence pairs.  Evaluation results show that the model is able predict lexical translation probabilities with a root mean square error of 0.07 (coefficient of determination R2 = 0.47). Based on the scores produced by this regression model, the parallel corpora can be filtered to prune out low-quality alignments.",{"paper_id":5941,"title":5942,"year":81,"month":855,"day":63,"doi":5943,"resource_url":5944,"first_page":63,"last_page":63,"pdf_url":5945,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5946,"paper_type":860,"authors":5947,"abstract":5962},"lrec2018-main-276","Building an Ellipsis-aware Chinese Dependency Treebank for Web Text","10.63317\u002F4zqurzns6pqh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-276","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F297.pdf","ren-etal-2018-building",[5948,5951,5952,5955,5957,5960],{"paper_id":5941,"author_seq":247,"given_name":5949,"surname":5950,"affiliation":63,"orcid":63},"Xuancheng","Ren",{"paper_id":5941,"author_seq":232,"given_name":1585,"surname":3774,"affiliation":63,"orcid":63},{"paper_id":5941,"author_seq":218,"given_name":5953,"surname":5954,"affiliation":63,"orcid":63},"Ji","Wen",{"paper_id":5941,"author_seq":203,"given_name":5956,"surname":2435,"affiliation":63,"orcid":63},"Bingzhen",{"paper_id":5941,"author_seq":188,"given_name":5958,"surname":5959,"affiliation":63,"orcid":63},"Weidong","Zhan",{"paper_id":5941,"author_seq":172,"given_name":5961,"surname":2381,"affiliation":63,"orcid":63},"Zhiyuan","Web 2.0 has brought with it numerous user-produced data revealing one's thoughts, experiences, and knowledge, which 
are a great source for many tasks, such as information extraction, and knowledge base construction. However, the colloquial nature of the texts poses new challenges for current natural language processing techniques, which are more adapt to the formal form of the language. Ellipsis is a common linguistic phenomenon that some words are left out as they are understood from the context, especially in oral utterance, hindering the improvement of dependency parsing, which is of great importance for tasks relied on the meaning of the sentence. In order to promote research in this area, we are releasing a Chinese dependency treebank of 319 weibos, containing 572 sentences with omissions restored and contexts reserved.",{"paper_id":5964,"title":5965,"year":81,"month":855,"day":63,"doi":5966,"resource_url":5967,"first_page":63,"last_page":63,"pdf_url":5968,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5969,"paper_type":860,"authors":5970,"abstract":5986},"lrec2018-main-277","EuroGames16: Evaluating Change Detection in Online Conversation","10.63317\u002F2jahucpbw5ba","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-277","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F335.pdf","goutte-etal-2018-eurogames16",[5971,5973,5975,5978,5981,5983],{"paper_id":5964,"author_seq":247,"given_name":1406,"surname":5972,"affiliation":63,"orcid":63},"Goutte",{"paper_id":5964,"author_seq":232,"given_name":5974,"surname":1588,"affiliation":63,"orcid":63},"Yunli",{"paper_id":5964,"author_seq":218,"given_name":5976,"surname":5977,"affiliation":63,"orcid":63},"Fangming","Liao",{"paper_id":5964,"author_seq":203,"given_name":5979,"surname":5980,"affiliation":63,"orcid":63},"Zachary","Zanussi",{"paper_id":5964,"author_seq":188,"given_name":1614,"surname":5982,"affiliation":63,"orcid":63},"Larkin",{"paper_id":5964,"author_seq":172,"given_name":5984,"surname":5985,"affiliation":63,"orcid":63},"Yuri","Grinberg","We introduce the 
challenging task of detecting changes from an online conversation. Our goal is to detect significant changes in, for example, sentiment or topic in a stream of messages that are part of an ongoing conversation. Our approach relies on first applying linguistic preprocessing or collecting simple statistics on the messages in the conversation in order to build a time series. Change point detection algorithms are then applied to identify the location of significant changes in the distribution of the underlying time series. We present a collection of sports events on which we can evaluate the performance of our change detection method. Our experiments, using several change point detection algorithms and several types of time series, show that it is possible to detect salient changes in an on-line conversation with relatively high accuracy.",{"paper_id":5988,"title":5989,"year":81,"month":855,"day":63,"doi":5990,"resource_url":5991,"first_page":63,"last_page":63,"pdf_url":5992,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":5993,"paper_type":860,"authors":5994,"abstract":5999},"lrec2018-main-278","A Deep Neural Network based Approach for Entity Extraction in Code-Mixed Indian Social Media Text","10.63317\u002F4kkwj57kxfv7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-278","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F486.pdf","gupta-etal-2018-deep",[5995,5997,5998],{"paper_id":5988,"author_seq":247,"given_name":5996,"surname":2603,"affiliation":63,"orcid":63},"Deepak",{"paper_id":5988,"author_seq":232,"given_name":1861,"surname":1862,"affiliation":63,"orcid":63},{"paper_id":5988,"author_seq":218,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},"The rise in accessibility of web to the masses has led to a spurt in the use of social media making it convenient and powerful way to express and exchange information in their own language(s). 
India, being enormously diversified country have more than 168 millions users on social media. This diversity is also reflected in their scripts where a majority of users often switch between their native language to be more expressive. These linguistic variations make automatic entity extraction both a necessary and a challenging problem. In this paper, we report our work for entity extraction in a code-mixed environment. Entity extraction is a fundamental component in many natural language processing (NLP) applications. The task of entity extraction faces more challenges while dealing with unstructured and informal texts, and mixing of scripts (i.e., code-mixing) further adds complexities to the process. Our proposed approach is based on the popular deep neural network based Gated Recurrent Unit (GRU) units that discover the higher level features from the text automatically. It does not require handcrafted features or rules, unlike the existing systems. To the best of our knowledge, it is the first attempt for entity extraction from code mixed data using the deep neural network. 
The proposed system achieves the F-scores of 66.04% and 53.85% for English-Hindi and English-Tamil language pairs, respectively.",{"paper_id":6001,"title":6002,"year":81,"month":855,"day":63,"doi":6003,"resource_url":6004,"first_page":63,"last_page":63,"pdf_url":6005,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6006,"paper_type":860,"authors":6007,"abstract":6023},"lrec2018-main-279","PoSTWITA-UD: an Italian Twitter Treebank in Universal Dependencies","10.63317\u002F4nmzdtvpzheh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-279","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F636.pdf","sanguinetti-etal-2018-postwita",[6008,6010,6013,6015,6017,6020],{"paper_id":6001,"author_seq":247,"given_name":2539,"surname":6009,"affiliation":63,"orcid":63},"Sanguinetti",{"paper_id":6001,"author_seq":232,"given_name":6011,"surname":6012,"affiliation":63,"orcid":63},"Cristina","Bosco",{"paper_id":6001,"author_seq":218,"given_name":2086,"surname":6014,"affiliation":63,"orcid":63},"Lavelli",{"paper_id":6001,"author_seq":203,"given_name":2769,"surname":6016,"affiliation":63,"orcid":63},"Mazzei",{"paper_id":6001,"author_seq":188,"given_name":6018,"surname":6019,"affiliation":63,"orcid":63},"Oronzo","Antonelli",{"paper_id":6001,"author_seq":172,"given_name":6021,"surname":6022,"affiliation":63,"orcid":63},"Fabio","Tamburini","Due to the spread of social media-based applications and the challenges posed by the treatment of social media texts in NLP tools, tailored approaches and ad hoc resources are required to provide the proper coverage of specific linguistic phenomena. Various attempts to produce this kind of specialized resources and tools are described in literature. However, most of these attempts mainly focus on PoS-tagged corpora and only a few of them deal with syntactic annotation. This is particularly true for the Italian language, for which such a resource is currently missing. 
We thus propose the development of PoSTWITA-UD, a collection of tweets annotated according to a well-known dependency-based annotation format: the Universal Dependencies. The goal of this work is manifold, and it mainly consists in creating a resource that, especially for Italian, can be exploited for the training of NLP systems so as to enhance their performance on social media texts. In this paper we focus on the current state of the resource.",{"paper_id":6025,"title":6026,"year":81,"month":855,"day":63,"doi":6027,"resource_url":6028,"first_page":63,"last_page":63,"pdf_url":6029,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6030,"paper_type":860,"authors":6031,"abstract":6036},"lrec2018-main-280","Annotating If the Authors of a Tweet are Located at the Locations They Tweet About","10.63317\u002F2ejjehecmszn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-280","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1011.pdf","doudagiri-etal-2018-annotating",[6032,6034,6035],{"paper_id":6025,"author_seq":247,"given_name":2565,"surname":6033,"affiliation":63,"orcid":63},"Doudagiri",{"paper_id":6025,"author_seq":232,"given_name":1909,"surname":1910,"affiliation":63,"orcid":63},{"paper_id":6025,"author_seq":218,"given_name":1912,"surname":1913,"affiliation":63,"orcid":63},"The locations in a tweet do not always indicate spatial information involving the author of the tweet. In this paper, we investigate whether authors are located or not located in the locations they tweet about, and temporally anchor this spatial information in the tweet timestamp. Specifically, we work with temporal tags centred around the tweet timestamp: longer than 24 hours before or after tweeting, within 24 hours before or after tweeting, and at the time of tweeting. 
We introduce a corpus of 1,200 location mentions from 1,062 tweets, discuss several annotation samples, and analyze annotator disagreements.",{"paper_id":6038,"title":6039,"year":81,"month":855,"day":63,"doi":6040,"resource_url":6041,"first_page":63,"last_page":63,"pdf_url":6042,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6043,"paper_type":860,"authors":6044,"abstract":6050},"lrec2018-main-281","MOCCA: Measure of Confidence for Corpus Analysis - Automatic Reliability Check of Transcript and Automatic Segmentation","10.63317\u002F2i4h95kjujmy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-281","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F43.pdf","kisler-schiel-2018-mocca",[6045,6047],{"paper_id":6038,"author_seq":247,"given_name":1615,"surname":6046,"affiliation":63,"orcid":63},"Kisler",{"paper_id":6038,"author_seq":232,"given_name":6048,"surname":6049,"affiliation":63,"orcid":63},"Florian","Schiel","The production of speech corpora typically involves manual labor to verify and correct the output of automatic transcription\u002Fsegmentation processes. This study investigates the possibility of speeding up this correction process using techniques borrowed from automatic speech recognition to predict the location of transcription or segmentation errors in the signal. This was achieved with functionals of features derived from a typical Hidden Markov Model (HMM)-based speech segmentation system and a classification\u002Fregression approach based on Support Vector Machine (SVM)\u002FSupport Vector Regression (SVR) and Random Forest (RF). Classifiers were tuned in a 10-fold cross validation on an annotated corpus of spontaneous speech. 
Tests on an independent speech corpus from a different domain showed that transcription errors were predicted with an accuracy of 78% using an SVM, while segmentation errors were predicted in the form of an overlap-measure which showed a Pearson correlation of 0.64 to a ground truth using Support Vector Regression (SVR). The methods described here will be implemented as free-to-use Common Language and Resources and Technology Infrastucture (CLARIN) web services.",{"paper_id":6052,"title":6053,"year":81,"month":855,"day":63,"doi":6054,"resource_url":6055,"first_page":63,"last_page":63,"pdf_url":6056,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6057,"paper_type":860,"authors":6058,"abstract":6063},"lrec2018-main-282","Towards an ISO Standard for the Annotation of Quantification","10.63317\u002F25nvjp8ssjgt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-282","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F406.pdf","bunt-etal-2018-towards",[6059,6060,6061],{"paper_id":6052,"author_seq":247,"given_name":3226,"surname":3227,"affiliation":63,"orcid":63},{"paper_id":6052,"author_seq":232,"given_name":1016,"surname":1017,"affiliation":63,"orcid":63},{"paper_id":6052,"author_seq":218,"given_name":6062,"surname":4032,"affiliation":63,"orcid":63},"Kiyong","This paper presents an approach to the annotation of quantification that is being developed in preparation of the specification of a quantification annotation scheme, as part of an effort of the International Organisation for Standardisation ISO to define interoperable semantic annotation schemes. The paper focuses on the theoretical basis for an ISO standard annotation scheme for quantification phenomena. 
It is argued that the combination of Generalized Quantifier Theory, neo-Davidsonian event-bases semantics, Discourse Representation Theory, and the ISO Principles of semantic annotation forms a powerful and solid foundation for defining annotations of quantification phenomena with              an abstract and a concrete syntax and a compositional semantics. The coverage of the proposed annotation scheme includes both count and mass NP quantifiers, as well as quantification by NPs with syntactically and semantically complex heads with internal quantification and scoping structures, such as inverse linking by prepositional phrases and relative clauses.",{"paper_id":6065,"title":6066,"year":81,"month":855,"day":63,"doi":6067,"resource_url":6068,"first_page":63,"last_page":63,"pdf_url":6069,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6070,"paper_type":860,"authors":6071,"abstract":6079},"lrec2018-main-283","Lightweight Grammatical Annotation in the TEI: New Perspectives","10.63317\u002F2kyv3tswod86","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-283","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F422.pdf","banski-etal-2018-lightweight",[6072,6074,6077],{"paper_id":6065,"author_seq":247,"given_name":996,"surname":6073,"affiliation":63,"orcid":63},"Bański",{"paper_id":6065,"author_seq":232,"given_name":6075,"surname":6076,"affiliation":63,"orcid":63},"Susanne","Haaf",{"paper_id":6065,"author_seq":218,"given_name":1248,"surname":6078,"affiliation":63,"orcid":63},"Mueller","In mid-2017, as part of our activities within the TEI Special Interest Group for Linguists (LingSIG), we submitted to the TEI  Technical Council a proposal for a new attribute class that would gather attributes facilitating simple token-level linguistic annotation. With this proposal, we addressed community feedback complaining about the lack of a specific tagset for lightweight linguistic annotation within the TEI. 
Apart from @lemma and @lemmaRef, up till now TEI encoders could only resort to using the generic attribute @ana for inline linguistic annotation, or to the quite complex system of feature structures for robust linguistic annotation, the latter requiring relatively complex processing even for the most basic types of linguistic features. As a result, there exists now a small set of basic descriptive devices which have been made available at the cost of only very small changes to the TEI tagset. The merit of a predefined TEI tagset for lightweight linguistic annotation is the homogeneity of tagging and thus better interoperability of simple linguistic resources encoded in the TEI. The present paper introduces the new attributes, makes a case for one more addition, and presents the advantages of the new system over the legacy TEI solutions.",{"paper_id":6081,"title":6082,"year":81,"month":855,"day":63,"doi":6083,"resource_url":6084,"first_page":63,"last_page":63,"pdf_url":6085,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6086,"paper_type":860,"authors":6087,"abstract":6097},"lrec2018-main-284","A Gold Standard for Multilingual Automatic Term Extraction from Comparable Corpora: Term Structure and Translation Equivalents","10.63317\u002F5ix2bvvntn8r","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-284","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F598.pdf","rigouts-terryn-etal-2018-gold",[6088,6091,6094],{"paper_id":6081,"author_seq":247,"given_name":6089,"surname":6090,"affiliation":63,"orcid":63},"Ayla","Rigouts Terryn",{"paper_id":6081,"author_seq":232,"given_name":6092,"surname":6093,"affiliation":63,"orcid":63},"Véronique","Hoste",{"paper_id":6081,"author_seq":218,"given_name":6095,"surname":6096,"affiliation":63,"orcid":63},"Els","Lefever","Terms are notoriously difficult to identify, both automatically and manually. 
This complicates the evaluation of the already challenging task of automatic term extraction. With the advent of multilingual automatic term extraction from comparable corpora, accurate evaluation becomes increasingly difficult, since term linking must be evaluated as well as term extraction.  A gold standard with manual annotations for a complete comparable corpus has been developed, based on a novel methodology created to accommodate for the intrinsic difficulties of this task. In this contribution, we show how the effort involved in the development of this gold standard resulted, not only in a tool for evaluation, but also in a rich source of information about terms. A detailed analysis of term characteristics illustrates how such knowledge about terms may inspire improvements for automatic term extraction.",{"paper_id":6099,"title":6100,"year":81,"month":855,"day":63,"doi":6101,"resource_url":6102,"first_page":63,"last_page":63,"pdf_url":6103,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6104,"paper_type":860,"authors":6105,"abstract":6116},"lrec2018-main-285","Handling Big Data and Sensitive Data Using EUDAT’s Generic Execution Framework and the WebLicht Workflow Engine.","10.63317\u002F5pnvhixpteod","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-285","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F497.pdf","zinn-etal-2018-handling",[6106,6107,6109,6110,6113],{"paper_id":6099,"author_seq":247,"given_name":1347,"surname":1348,"affiliation":63,"orcid":63},{"paper_id":6099,"author_seq":232,"given_name":2435,"surname":6108,"affiliation":63,"orcid":63},"Qui",{"paper_id":6099,"author_seq":218,"given_name":4656,"surname":4651,"affiliation":63,"orcid":63},{"paper_id":6099,"author_seq":203,"given_name":6111,"surname":6112,"affiliation":63,"orcid":63},"Emanuel","Dima",{"paper_id":6099,"author_seq":188,"given_name":6114,"surname":6115,"affiliation":63,"orcid":63},"Alexandr","Chernov","Web-based tools 
and workflow engines can often not be applied to data   with restrictive property rights and to big data. In both cases, it is better   to move the tools to the data rather than having the data travel to the tools.   In this paper, we report on the progress to bring together the CLARIN-based   WebLicht workflow engine with the EUDAT-based Generic Execution Framework to   address this issue.",{"paper_id":6118,"title":6119,"year":81,"month":855,"day":63,"doi":6120,"resource_url":6121,"first_page":63,"last_page":63,"pdf_url":6122,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6123,"paper_type":860,"authors":6124,"abstract":6132},"lrec2018-main-286","Building a Web-Scale Dependency-Parsed Corpus from CommonCrawl","10.63317\u002F4vstsqywjmip","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-286","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F215.pdf","panchenko-etal-2018-building",[6125,6126,6129,6130,6131],{"paper_id":6118,"author_seq":247,"given_name":2736,"surname":2737,"affiliation":63,"orcid":63},{"paper_id":6118,"author_seq":232,"given_name":6127,"surname":6128,"affiliation":63,"orcid":63},"Eugen","Ruppert",{"paper_id":6118,"author_seq":218,"given_name":2733,"surname":2734,"affiliation":63,"orcid":63},{"paper_id":6118,"author_seq":203,"given_name":5360,"surname":2742,"affiliation":63,"orcid":63},{"paper_id":6118,"author_seq":188,"given_name":1367,"surname":2739,"affiliation":63,"orcid":63},"We present DepCC, the largest-to-date linguistically analyzed corpus in English including 365 million documents, composed of 252 billion tokens and 7.5 billion of named entity occurrences in 14.3 billion sentences from a web-scale crawl of the Common Crawl project. 
The sentences are processed with a dependency parser and with a named entity tagger and contain provenance information, enabling various applications ranging from training syntax-based word embeddings to open information extraction and question answering. We built an index of all sentences and their linguistic meta-data enabling quick search across the corpus. We demonstrate the utility of this corpus on the verb similarity task by showing that a distributional model trained on our corpus yields better results than models trained on smaller corpora, like Wikipedia. This distributional model outperforms the state of art models of verb similarity trained on smaller corpora on the SimVerb3500 dataset.",{"paper_id":6134,"title":6135,"year":81,"month":855,"day":63,"doi":6136,"resource_url":6137,"first_page":63,"last_page":63,"pdf_url":6138,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6139,"paper_type":860,"authors":6140,"abstract":6162},"lrec2018-main-287","Universal Dependencies Version 2 for 
Japanese","10.63317\u002F4uskraiweqvd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-287","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F276.pdf","asahara-etal-2018-universal",[6141,6142,6144,6147,6150,6153,6155,6156,6159],{"paper_id":6134,"author_seq":247,"given_name":3912,"surname":3913,"affiliation":63,"orcid":63},{"paper_id":6134,"author_seq":232,"given_name":2172,"surname":6143,"affiliation":63,"orcid":63},"Kanayama",{"paper_id":6134,"author_seq":218,"given_name":6145,"surname":6146,"affiliation":63,"orcid":63},"Takaaki","Tanaka",{"paper_id":6134,"author_seq":203,"given_name":6148,"surname":6149,"affiliation":63,"orcid":63},"Yusuke","Miyao",{"paper_id":6134,"author_seq":188,"given_name":6151,"surname":6152,"affiliation":63,"orcid":63},"Sumire","Uematsu",{"paper_id":6134,"author_seq":172,"given_name":6154,"surname":3104,"affiliation":63,"orcid":63},"Shinsuke",{"paper_id":6134,"author_seq":155,"given_name":3649,"surname":2464,"affiliation":63,"orcid":63},{"paper_id":6134,"author_seq":138,"given_name":6157,"surname":6158,"affiliation":63,"orcid":63},"Mai","Omura",{"paper_id":6134,"author_seq":121,"given_name":6160,"surname":6161,"affiliation":63,"orcid":63},"Yugo","Murawaki","The Universal Dependencies (UD) project (McDonald et al., 2013) has defined a consistent, crosslinguistic target and syntactic structure representation format. In this presentation, we will show the work of the UD Japanese team. The UD Japanese team was organised by interested people who are developing their own treebanks or parsers. We developed and maintained several UD guidelines (version 2.0) compatible data for Japanese. Most of the data are made through automatic conversion from the existing treebank. The UD annotation guideline was updated from version 1 to version 2 in early 2017. The automatic conversion enabled us to adapt the existing annotation based on traditional Japanese grammar conventions for the UD annotation guideline changes. 
In this paper, we discuss the current issues of UD Japanese resources until today. These issues come from the difficulty to perform cross-linguistically consistent annotation for the different grammatical system from western European languages. The points at the issues related to the conversions are split into the delimitation (word, phrase and clause), undefined policies of UD guideline, typological systems for UD, and copyright of Japanese language resources.",{"paper_id":6164,"title":6165,"year":81,"month":855,"day":63,"doi":6166,"resource_url":6167,"first_page":63,"last_page":63,"pdf_url":6168,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6169,"paper_type":860,"authors":6170,"abstract":6175},"lrec2018-main-288","Developing the Bangla RST Discourse Treebank","10.63317\u002F56fzpsjj9h76","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-288","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F387.pdf","das-stede-2018-developing",[6171,6174],{"paper_id":6164,"author_seq":247,"given_name":6172,"surname":6173,"affiliation":63,"orcid":63},"Debopam","Das",{"paper_id":6164,"author_seq":232,"given_name":1058,"surname":5633,"affiliation":63,"orcid":63},"We present a corpus development project which builds a corpus in Bangla called the Bangla RST Discourse Treebank. The corpus contains a collection of 266 Bangla text, which are annotated for coherence relations (relations between propositions, such as Cause or Evidence). The texts represent the newspaper genre, which is further divided into eight sub-genres, such as business-related news, editorial columns and sport reports. We use Rhetorical Structure Theory (Mann and Thompson, 1988) as the theoretical framework of the corpus. In particular, we develop our annotation guidelines based on the guidelines used in the Potsdam Commentary Corpus (Stede, 2016). 
In the initial phase of the corpus development process, we have annotated 16 texts, and also conducted an inter-annotator agreement study, evaluating the reliability of our guidelines and the reproducibility of our annotation. The corpus upon its completion could be used as a valuable resource for conducting (cross-linguistic) discourse studies for Bangla, and also for developing various NLP applications, such as text summarization, machine translation or sentiment analysis.",{"paper_id":6177,"title":6178,"year":81,"month":855,"day":63,"doi":6179,"resource_url":6180,"first_page":63,"last_page":63,"pdf_url":6181,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6182,"paper_type":860,"authors":6183,"abstract":6191},"lrec2018-main-289","A New Version of the Składnica Treebank of Polish Harmonised with the Walenty Valency Dictionary","10.63317\u002F5fhaydmxij8t","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-289","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F765.pdf","wolinski-etal-2018-new",[6184,6186,6189],{"paper_id":6177,"author_seq":247,"given_name":5535,"surname":6185,"affiliation":63,"orcid":63},"Woliński",{"paper_id":6177,"author_seq":232,"given_name":6187,"surname":6188,"affiliation":63,"orcid":63},"Elżbieta","Hajnicz",{"paper_id":6177,"author_seq":218,"given_name":2155,"surname":6190,"affiliation":63,"orcid":63},"Bartosiak","This paper reports on developments in the Składnica treebank of Polish which were possible due to the switch to the Walenty valency dictionary. The change required several modifications in the Świgra parser, such as implementing unlike coordination, semantically motivated phrases, and non-standard case values. A procedure to upgrade manually disambiguated trees of Składnica was required as well. Modifications introduced in the treebank included systematic changes of notation and resolving ambiguity between semantically motivated phrases.  
The procedure of confronting Składnica treebank with the trees generated with the new version of the Świgra parser using Walenty dictionary allowed us to check the consistency of all the resources. This resulted in several corrections introduced in both the treebank and the valence dictionary.",{"paper_id":6193,"title":6194,"year":81,"month":855,"day":63,"doi":6195,"resource_url":6196,"first_page":63,"last_page":63,"pdf_url":6197,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6198,"paper_type":860,"authors":6199,"abstract":6210},"lrec2018-main-290","Parse Me if You Can: Artificial Treebanks for Parsing Experiments on Elliptical Constructions","10.63317\u002F3ms66ftakthe","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-290","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1032.pdf","droganova-etal-2018-parse",[6200,6202,6204,6207],{"paper_id":6193,"author_seq":247,"given_name":5765,"surname":6201,"affiliation":63,"orcid":63},"Droganova",{"paper_id":6193,"author_seq":232,"given_name":2554,"surname":6203,"affiliation":63,"orcid":63},"Zeman",{"paper_id":6193,"author_seq":218,"given_name":6205,"surname":6206,"affiliation":63,"orcid":63},"Jenna","Kanerva",{"paper_id":6193,"author_seq":203,"given_name":6208,"surname":6209,"affiliation":63,"orcid":63},"Filip","Ginter","In this work we focus on a particular linguistic phenomenon, ellipsis, and explore the latest parsers in order to learn about parsing accuracy and typical errors from the perspective of elliptical constructions. For this purpose we collected and processed outputs of several state-of-the art parsers that took part in the CoNLL 2017 Shared Task. We extended the official shared task evaluation software to obtain focused evaluation of elliptical constructions. 
Since the studied structures are comparatively rare, and consequently there is not enough data for experimentation, we further describe the creation of a new resource, a semi-artificially constructed treebank of ellipsis.",{"paper_id":6212,"title":6213,"year":81,"month":855,"day":63,"doi":6214,"resource_url":6215,"first_page":63,"last_page":63,"pdf_url":6216,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6217,"paper_type":860,"authors":6218,"abstract":6228},"lrec2018-main-291","Semi-Automatic Construction of Word-Formation Networks (for Polish and Spanish)","10.63317\u002F3xxx33zke9sm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-291","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F231.pdf","lango-etal-2018-semi",[6219,6222,6225],{"paper_id":6212,"author_seq":247,"given_name":6220,"surname":6221,"affiliation":63,"orcid":63},"Mateusz","Lango",{"paper_id":6212,"author_seq":232,"given_name":6223,"surname":6224,"affiliation":63,"orcid":63},"Magda","Ševčíková",{"paper_id":6212,"author_seq":218,"given_name":6226,"surname":6227,"affiliation":63,"orcid":63},"Zdeněk","Žabokrtský","The paper presents a semi-automatic method for the construction of derivational networks. The proposed approach applies a sequential pattern mining technique in order to construct useful morphological features in an unsupervised manner. The features take the form of regular expressions and later are used to feed a machine-learned ranking model. The network is constructed by applying resulting model to sort the lists of possible base words and selecting the most probable ones. This approach, besides relatively small training set and a lexicon, does not require any additional language resources such as a list of alternations groups, POS tags etc. The proposed approach is applied to the lexeme sets of two languages, namely Polish and Spanish, which results in the establishment of two novel word-formation networks. 
Finally, the network constructed for Polish is merged with the derivational connections extracted from the Polish WordNet and those resulting from the derivational rules developed by a linguist, resulting in the biggest word-formation network for that language.  The presented approach is general enough to be adopted for other languages.",{"paper_id":6230,"title":6231,"year":81,"month":855,"day":63,"doi":6232,"resource_url":6233,"first_page":63,"last_page":63,"pdf_url":6234,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6235,"paper_type":860,"authors":6236,"abstract":6240},"lrec2018-main-292","A multilingual collection of CoNLL-U-compatible morphological lexicons","10.63317\u002F24it944bfjwx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-292","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F705.pdf","sagot-2018-multilingual",[6237],{"paper_id":6230,"author_seq":247,"given_name":6238,"surname":6239,"affiliation":63,"orcid":63},"Benoît","Sagot","We introduce UDLexicons, a multilingual collection of morphological lexicons that follow the guidelines and format of the Universal Dependencies initiative. We describe the three approaches we use to create 53 morphological lexicons covering 38 languages, based on existing resources. 
These lexicons, which are freely available, have already proven useful for improving part-of-speech tagging accuracy in state-of-the-art architectures.",{"paper_id":6242,"title":6243,"year":81,"month":855,"day":63,"doi":6244,"resource_url":6245,"first_page":63,"last_page":63,"pdf_url":6246,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6247,"paper_type":860,"authors":6248,"abstract":6279},"lrec2018-main-293","UniMorph 2.0: Universal Morphology","10.63317\u002F3rq3n9kr4gk8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-293","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F789.pdf","kirov-etal-2018-unimorph",[6249,6252,6254,6256,6258,6260,6261,6264,6267,6269,6272,6273,6276],{"paper_id":6242,"author_seq":247,"given_name":6250,"surname":6251,"affiliation":63,"orcid":63},"Christo","Kirov",{"paper_id":6242,"author_seq":232,"given_name":3118,"surname":6253,"affiliation":63,"orcid":63},"Cotterell",{"paper_id":6242,"author_seq":218,"given_name":1734,"surname":6255,"affiliation":63,"orcid":63},"Sylak-Glassman",{"paper_id":6242,"author_seq":203,"given_name":1121,"surname":6257,"affiliation":63,"orcid":63},"Walther",{"paper_id":6242,"author_seq":188,"given_name":2130,"surname":6259,"affiliation":63,"orcid":63},"Vylomova",{"paper_id":6242,"author_seq":172,"given_name":2451,"surname":1386,"affiliation":63,"orcid":63},{"paper_id":6242,"author_seq":155,"given_name":6262,"surname":6263,"affiliation":63,"orcid":63},"Manaal","Faruqui",{"paper_id":6242,"author_seq":138,"given_name":6265,"surname":6266,"affiliation":63,"orcid":63},"Sabrina 
J.","Mielke",{"paper_id":6242,"author_seq":121,"given_name":6268,"surname":3753,"affiliation":63,"orcid":63},"Arya",{"paper_id":6242,"author_seq":104,"given_name":6270,"surname":6271,"affiliation":63,"orcid":63},"Sandra","Kübler",{"paper_id":6242,"author_seq":87,"given_name":1199,"surname":3705,"affiliation":63,"orcid":63},{"paper_id":6242,"author_seq":73,"given_name":6274,"surname":6275,"affiliation":63,"orcid":63},"Jason","Eisner",{"paper_id":6242,"author_seq":55,"given_name":6277,"surname":6278,"affiliation":63,"orcid":63},"Mans","Hulden","The Universal Morphology (UniMorph) project is a collaborative effort to improve how NLP handles complex morphology across the world's languages. The project releases annotated morphological data using a universal tagset, the UniMorph schema. Each inflected form is associated with a lemma, which typically carries its underlying lexical meaning, and a bundle of morphological features from our schema. Additional supporting data and tools are also released on a per-language basis when available. UniMorph is based at the Center for Language and Speech Processing (CLSP) at Johns Hopkins University in Baltimore, Maryland. 
This paper details advances made to the collection, annotation, and dissemination of project resources since the initial UniMorph release described at LREC 2016.",{"paper_id":6281,"title":6282,"year":81,"month":855,"day":63,"doi":6283,"resource_url":6284,"first_page":63,"last_page":63,"pdf_url":6285,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6286,"paper_type":860,"authors":6287,"abstract":6299},"lrec2018-main-294","A Computational Architecture for the Morphology of Upper Tanana","10.63317\u002F4jo5xbtb2sex","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-294","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F816.pdf","lovick-etal-2018-computational",[6288,6290,6292,6295,6298],{"paper_id":6281,"author_seq":247,"given_name":3749,"surname":6289,"affiliation":63,"orcid":63},"Lovick",{"paper_id":6281,"author_seq":232,"given_name":1359,"surname":6291,"affiliation":63,"orcid":63},"Cox",{"paper_id":6281,"author_seq":218,"given_name":6293,"surname":6294,"affiliation":63,"orcid":63},"Miikka","Silfverberg",{"paper_id":6281,"author_seq":203,"given_name":6296,"surname":6297,"affiliation":63,"orcid":63},"Antti","Arppe",{"paper_id":6281,"author_seq":188,"given_name":6277,"surname":6278,"affiliation":63,"orcid":63},"In this paper, we describe a computational model of Upper Tanana, a highly endangered Dene (Athabaskan) language spoken in eastern interior Alaska (USA) and in the Yukon Territory (Canada). This model not only parses and generates Upper Tanana verb forms, but uses the language's verb theme category system, a system of lexical-inflectional verb classes, to additionally predict possible derivations and their morphological behavior. This allows us to model a large portion of the Upper Tanana verb lexicon, making it more accessible to learners and scholars alike. 
Generated derivations will be compared against the narrative corpus of the language as well to the (much more comprehensive) lexical documentation of closely related languages.",{"paper_id":6301,"title":6302,"year":81,"month":855,"day":63,"doi":6303,"resource_url":6304,"first_page":63,"last_page":63,"pdf_url":6305,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6306,"paper_type":860,"authors":6307,"abstract":6309},"lrec2018-main-295","Expanding Abbreviations in a Strongly Inflected Language: Are Morphosyntactic Tags Sufficient?","10.63317\u002F33k7tyepaxs4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-295","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F964.pdf","zelasko-2018-expanding",[6308],{"paper_id":6301,"author_seq":247,"given_name":996,"surname":2150,"affiliation":63,"orcid":63},"In this paper, the problem of recovery of morphological information lost in abbreviated forms is addressed with a focus on highly inflected languages. Evidence is presented that the correct inflected form of an expanded abbreviation can in many cases be deduced solely from the morphosyntactic tags of the context. The prediction model is a deep bidirectional LSTM network with tag embedding. The training and evaluation data are gathered by finding the words which could have been abbreviated and using their corresponding morphosyntactic tags as the labels, while the tags of the context words are used as the input features for classification. The network is trained on over 10 million words from the Polish Sejm Corpus and achieves 74.2% prediction accuracy on a smaller, but more general National Corpus of Polish. 
The analysis of errors suggests that performance in this task may improve if some prior knowledge about the abbreviated word is incorporated into the model.",{"paper_id":6311,"title":6312,"year":81,"month":855,"day":63,"doi":6313,"resource_url":6314,"first_page":63,"last_page":63,"pdf_url":6315,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6316,"paper_type":860,"authors":6317,"abstract":6324},"lrec2018-main-296","A High-Quality Gold Standard for Citation-based Tasks","10.63317\u002F2p2zuvfafqmr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-296","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F283.pdf","farber-etal-2018-high",[6318,6320,6322],{"paper_id":6311,"author_seq":247,"given_name":1780,"surname":6319,"affiliation":63,"orcid":63},"Färber",{"paper_id":6311,"author_seq":232,"given_name":2736,"surname":6321,"affiliation":63,"orcid":63},"Thiemann",{"paper_id":6311,"author_seq":218,"given_name":3370,"surname":6323,"affiliation":63,"orcid":63},"Jatowt","Analyzing and recommending citations within their specific citation contexts has recently received much attention due to the growing number of available publications. Although data sets such as CiteSeerX have been created for evaluating approaches for such tasks, those data sets exhibit striking defects. This is understandable when one considers that both information extraction and entity linking, as well as entity resolution, need to be performed. In this paper, we propose a new evaluation data set for citation-dependent tasks based on arXiv.org publications. Our data set is characterized by the fact that it exhibits almost zero noise in its extracted content and that all citations are linked to their correct publications. Besides the pure content, available on a sentence-by-sentence basis, cited publications are annotated directly in the text via global identifiers. 
As far as possible, referenced publications are further linked to the DBLP Computer Science Bibliography. Our data set consists of over 15 million sentences and is freely available for research purposes. It can be used for training and testing citation-based tasks, such as recommending citations, determining the functions or importance of citations, and summarizing documents based on their citations.",{"paper_id":6326,"title":6327,"year":81,"month":855,"day":63,"doi":6328,"resource_url":6329,"first_page":63,"last_page":63,"pdf_url":6330,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6331,"paper_type":860,"authors":6332,"abstract":6340},"lrec2018-main-297","Measuring Innovation in Speech and Language Processing Publications.","10.63317\u002F3fepb73omudu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-297","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F522.pdf","mariani-etal-2018-measuring",[6333,6336,6339],{"paper_id":6326,"author_seq":247,"given_name":6334,"surname":6335,"affiliation":63,"orcid":63},"Joseph","Mariani",{"paper_id":6326,"author_seq":232,"given_name":6337,"surname":6338,"affiliation":63,"orcid":63},"Gil","Francopoulo",{"paper_id":6326,"author_seq":218,"given_name":2451,"surname":2452,"affiliation":63,"orcid":63},"The goal of this paper is to propose measures of innovation through the study of publications in the field of speech and language processing. It is based on the NLP4NLP corpus, which contains the articles published in major conferences and journals related to speech and language processing over 50 years (1965-2015). It represents 65,003 documents from 34 different sources, conferences and journals, published by 48,894 different authors in 558 events, for a total of more than 270 million words and 324,422 bibliographical references. The data was obtained in textual form or as an image that had to be converted into text. 
This resulted in a lower quality for the most ancient papers, that we measured through the computation of an unknown word ratio. The multi-word technical terms were automatically extracted after parsing, using a set of general language text corpora. The occurrences, frequencies, existences and presences of the terms were then computed overall, for each year and for each document. It resulted in a list of 3.5 million different terms and 24 million term occurrences. The evolution of the research topics over the year, as reflected by the terms presence, was then computed and we propose a measure of the topic popularity based on this computation. The author(s) who introduced the terms were searched for, together with the year when the term was first introduced and the publication where it was introduced. We then studied the global and evolutional contributions of authors to a given topic. We also studied the global and evolutional contributions of the various publications to a given topic. We finally propose a measure of innovativeness for authors and publications.",{"paper_id":6342,"title":6343,"year":81,"month":855,"day":63,"doi":6344,"resource_url":6345,"first_page":63,"last_page":63,"pdf_url":6346,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6347,"paper_type":860,"authors":6348,"abstract":6359},"lrec2018-main-298","PDFdigest: an Adaptable Layout-Aware PDF-to-XML Textual Content Extractor for Scientific 
Articles","10.63317\u002F3qtpno673czs","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-298","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1009.pdf","ferres-etal-2018-pdfdigest",[6349,6351,6354,6356],{"paper_id":6342,"author_seq":247,"given_name":2554,"surname":6350,"affiliation":63,"orcid":63},"Ferrés",{"paper_id":6342,"author_seq":232,"given_name":6352,"surname":6353,"affiliation":63,"orcid":63},"Horacio","Saggion",{"paper_id":6342,"author_seq":218,"given_name":5824,"surname":6355,"affiliation":63,"orcid":63},"Ronzano",{"paper_id":6342,"author_seq":203,"given_name":6357,"surname":6358,"affiliation":63,"orcid":63},"Àlex","Bravo","The availability of automated approaches and tools to extract structured textual content from PDF articles is essential to enable scientific text mining. This paper describes and evaluates the PDFdigest tool, a PDF-to-XML textual content extraction system specially designed to extract scientific articles' headings and logical structure  (title, authors, abstract,...) and its textual content. 
The extractor deals with both text-based and image-based PDF articles using custom rule-based algorithms based on existing state-of-the-art open-source tools for both PDF-to-HTML conversion and image-based PDF Optical Character Recognition.",{"paper_id":6361,"title":6362,"year":81,"month":855,"day":63,"doi":6363,"resource_url":6364,"first_page":63,"last_page":63,"pdf_url":6365,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6366,"paper_type":860,"authors":6367,"abstract":6398},"lrec2018-main-299","Automatic Identification of Research Fields in Scientific Papers","10.63317\u002F555ye9umecg5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-299","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1067.pdf","kergosien-etal-2018-automatic",[6368,6370,6373,6376,6379,6382,6385,6388,6391,6393,6395],{"paper_id":6361,"author_seq":247,"given_name":5665,"surname":6369,"affiliation":63,"orcid":63},"Kergosien",{"paper_id":6361,"author_seq":232,"given_name":6371,"surname":6372,"affiliation":63,"orcid":63},"Amin","Farvardin",{"paper_id":6361,"author_seq":218,"given_name":6374,"surname":6375,"affiliation":63,"orcid":63},"Maguelonne","Teisseire",{"paper_id":6361,"author_seq":203,"given_name":6377,"surname":6378,"affiliation":63,"orcid":63},"Marie-Noëlle","Bessagnet",{"paper_id":6361,"author_seq":188,"given_name":6380,"surname":6381,"affiliation":63,"orcid":63},"Joachim","Schöpfel",{"paper_id":6361,"author_seq":172,"given_name":6383,"surname":6384,"affiliation":63,"orcid":63},"Stéphane","Chaudiron",{"paper_id":6361,"author_seq":155,"given_name":6386,"surname":6387,"affiliation":63,"orcid":63},"Bernard","Jacquemin",{"paper_id":6361,"author_seq":138,"given_name":6389,"surname":6390,"affiliation":63,"orcid":63},"Annig","Lacayrelle",{"paper_id":6361,"author_seq":121,"given_name":5190,"surname":6392,"affiliation":63,"orcid":63},"Roche",{"paper_id":6361,"author_seq":104,"given_name":904,"surname":6394,"affiliation":63,"orcid":6
3},"Sallaberry",{"paper_id":6361,"author_seq":87,"given_name":6396,"surname":6397,"affiliation":63,"orcid":63},"Jean Philippe","Tonneau","The TERRE-ISTEX project aims to identify scientific research dealing with specific geographical territories areas based on heterogeneous digital content available in scientific papers. The project is divided into three main work packages: (1) identification of the periods and places of empirical studies, and which reflect the publications resulting from the analyzed text samples, (2) identification of the themes which appear in these documents, and (3) development of a web-based geographical information retrieval tool (GIR). The first two actions combine Natural Language Processing patterns with text mining methods. The integration of the spatial, thematic and temporal dimensions in a GIR contributes to a better understanding of what kind of research has been carried out, of its topics and its geographical and historical coverage. Another originality of the TERRE-ISTEX project is the heterogeneous character of the corpus, including PhD theses and scientific articles from the ISTEX digital libraries and the CIRAD research center.",{"paper_id":6400,"title":6401,"year":81,"month":855,"day":63,"doi":6402,"resource_url":6403,"first_page":63,"last_page":63,"pdf_url":6404,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6405,"paper_type":860,"authors":6406,"abstract":6412},"lrec2018-main-300","A «Portrait» Approach to Multichannel Discourse","10.63317\u002F5cz8cqkqsuxp","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-300","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F127.pdf","kibrik-fedorova-2018-portrait",[6407,6410],{"paper_id":6400,"author_seq":247,"given_name":6408,"surname":6409,"affiliation":63,"orcid":63},"Andrej","Kibrik",{"paper_id":6400,"author_seq":232,"given_name":3749,"surname":6411,"affiliation":63,"orcid":63},"Fedorova","This paper contributes to the 
research field of multichannel discourse analysis. Multimodal discourse analysis explores numerous channels involved in natural communication, such as verbal structure, prosody, gesticulation, facial expression, eye gaze, etc., and treats them as parts of an integral process. Among the key issues in multichannel studies is the question of the individual variation in multichannel behavior. We address this issue with the help of a multichannel resource “Russian Pear Chats and Stories” that is currently under construction (multidiscourse.ru). This corpus is based on a novel methodology of data collection and is produced with the help of state of the art technology including eyetracking. To address the issue of individual variation, we introduce the notion of a speaker’s individual portrait. In particular, we consider the Prosodic Portrait, the Oculomotor Portrait, and the Gesticulation Portrait. The proposed methodology is crucially important for fine-grained annotation procedures as well as for accurate statistic analyses of multichannel data.",{"paper_id":6414,"title":6415,"year":81,"month":855,"day":63,"doi":6416,"resource_url":6417,"first_page":63,"last_page":63,"pdf_url":6418,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6419,"paper_type":860,"authors":6420,"abstract":6428},"lrec2018-main-301","Multilingual Extension of PDTB-Style Annotation: The Case of TED Multilingual Discourse Bank","10.63317\u002F57ndug7eq9xq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-301","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F141.pdf","zeyrek-etal-2018-multilingual",[6421,6424,6425],{"paper_id":6414,"author_seq":247,"given_name":6422,"surname":6423,"affiliation":63,"orcid":63},"Deniz","Zeyrek",{"paper_id":6414,"author_seq":232,"given_name":3893,"surname":3894,"affiliation":63,"orcid":63},{"paper_id":6414,"author_seq":218,"given_name":6426,"surname":6427,"affiliation":63,"orcid":63},"Murathan","Kurfalı","We 
introduce TED-Multilingual Discourse Bank, a corpus of TED talks transcripts in 6 languages (English, German, Polish, European Portuguese, Russian and Turkish), where the ultimate aim is to provide a clearly described level of discourse structure and semantics in multiple languages. The corpus is manually annotated following the goals and principles of PDTB, involving explicit and implicit discourse connectives, entity relations, alternative lexicalizations and no relations. In the corpus, we also aim to capture the characteristics of spoken language that exist in the transcripts and adapt the PDTB scheme according to our aims; for example, we introduce hypophora. We spot other aspects of spoken discourse such as the discourse marker use of connectives to keep them distinct from their discourse connective use. TED-MDB is, to the best of our knowledge, one of the few multilingual discourse treebanks and is hoped to be a source of parallel data for contrastive linguistic analysis as well as language technology applications. 
We describe the corpus, the annotation procedure and provide preliminary corpus statistics.",{"paper_id":6430,"title":6431,"year":81,"month":855,"day":63,"doi":6432,"resource_url":6433,"first_page":63,"last_page":63,"pdf_url":6434,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6435,"paper_type":860,"authors":6436,"abstract":6447},"lrec2018-main-302","Building a Macro Chinese Discourse Treebank","10.63317\u002F4zyzutwstqut","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-302","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F147.pdf","chu-etal-2018-building",[6437,6440,6442,6444],{"paper_id":6430,"author_seq":247,"given_name":6438,"surname":6439,"affiliation":63,"orcid":63},"Xiaomin","Chu",{"paper_id":6430,"author_seq":232,"given_name":2590,"surname":6441,"affiliation":63,"orcid":63},"Jiang",{"paper_id":6430,"author_seq":218,"given_name":6443,"surname":1585,"affiliation":63,"orcid":63},"Sheng",{"paper_id":6430,"author_seq":203,"given_name":6445,"surname":6446,"affiliation":63,"orcid":63},"Qiaoming","Zhu","Discourse structure analysis is an important research topic in natural language processing. Discourse structure analysis not only helps to understand the discourse structure and semantics, but also provides strong support for deep applications of natural language processing, such as automatic summarization, statistical machine translation, question and answering, etc. At present, the analyses of discourse structure are mainly concentrated on the micro level, while the analyses on macro level are few. Therefore, this paper focuses on the construction of representation schema and corpus resources on the macro level of discourse structure. This paper puts forward a macro discourse structure framework and constructs the logical semantic structure and functional pragmatic structure respectively. On this basis, a macro Chinese discourse structure treebank is annotated, consisting of 147 Newswire articles. 
Preliminary experimental results show that the representation schema and corpus resource constructed in this paper can lay the foundation for further analysis of macro discourse structure.",{"paper_id":6449,"title":6450,"year":81,"month":855,"day":63,"doi":6451,"resource_url":6452,"first_page":63,"last_page":63,"pdf_url":6453,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6454,"paper_type":860,"authors":6455,"abstract":6462},"lrec2018-main-303","Enhancing the AI2 Diagrams Dataset Using Rhetorical Structure Theory","10.63317\u002F536kevb657cc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-303","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F172.pdf","hiippala-orekhova-2018-enhancing",[6456,6459],{"paper_id":6449,"author_seq":247,"given_name":6457,"surname":6458,"affiliation":63,"orcid":63},"Tuomo","Hiippala",{"paper_id":6449,"author_seq":232,"given_name":6460,"surname":6461,"affiliation":63,"orcid":63},"Serafina","Orekhova","This paper describes ongoing work on a multimodal resource based on the Allen Institute AI2 Diagrams (AI2D) dataset, which contains nearly 5000 grade-school level science diagrams that have been annotated for their elements and the semantic relations that hold between them. This emerging resource, named AI2D-RST, aims to provide a drop-in replacement for the annotation of semantic relations between diagram elements, whose description is informed by recent theories of multimodality and text-image relations. As the name of the resource suggests, the revised annotation schema is based on Rhetorical Structure Theory (RST), which has been previously used to describe the multimodal structure of diagrams and entire documents. The paper documents the proposed annotation schema, describes challenges in applying RST to diagrams, and reports on inter-annotator agreement for this task. 
Finally, the paper discusses the use of AI2D-RST for research on multimodality and artificial intelligence.",{"paper_id":6464,"title":6465,"year":81,"month":855,"day":63,"doi":6466,"resource_url":6467,"first_page":63,"last_page":63,"pdf_url":6468,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6469,"paper_type":860,"authors":6470,"abstract":6480},"lrec2018-main-304","QUD-Based Annotation of Discourse Structure and Information Structure: Tool and Evaluation","10.63317\u002F368a5yfgwg9q","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-304","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F241.pdf","de-kuthy-etal-2018-qud",[6471,6474,6477],{"paper_id":6464,"author_seq":247,"given_name":6472,"surname":6473,"affiliation":63,"orcid":63},"Kordula","De Kuthy",{"paper_id":6464,"author_seq":232,"given_name":6475,"surname":6476,"affiliation":63,"orcid":63},"Nils","Reiter",{"paper_id":6464,"author_seq":218,"given_name":6478,"surname":6479,"affiliation":63,"orcid":63},"Arndt","Riester","We discuss and evaluate a new annotation scheme and discourse-analytic method, the QUD-tree framework. We present an annotation study, in which the framework, based on the concept of Questions under Discussion, is applied to English and German interview data, using TreeAnno, an annotation tool specially developed for this new kind of discourse annotation. 
The results of an inter-annotator agreement study show that the new annotation method allows for reasonable agreement with regard to discourse structure and good agreement with regard to the annotation of information structure, which covers focus, background, contrastive topic and non-at-issue material.",{"paper_id":6482,"title":6483,"year":81,"month":855,"day":63,"doi":6484,"resource_url":6485,"first_page":63,"last_page":63,"pdf_url":6486,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6487,"paper_type":860,"authors":6488,"abstract":6496},"lrec2018-main-305","The Spot the Difference corpus: a multi-modal corpus of spontaneous task oriented spoken interactions","10.63317\u002F3rgnfnavydsx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-305","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F410.pdf","lopes-etal-2018-spot",[6489,6492,6494],{"paper_id":6482,"author_seq":247,"given_name":6490,"surname":6491,"affiliation":63,"orcid":63},"José","Lopes",{"paper_id":6482,"author_seq":232,"given_name":6475,"surname":6493,"affiliation":63,"orcid":63},"Hemmingsson",{"paper_id":6482,"author_seq":218,"given_name":1049,"surname":6495,"affiliation":63,"orcid":63},"Åstrand","This paper describes the Spot the Difference Corpus which contains 54 interactions between pairs of subjects interacting to find differences in two very similar scenes. The setup used, the participants' metadata and details about collection are described. We are releasing this corpus of task-oriented spontaneous dialogues. This release includes rich transcriptions, annotations, audio and video. We believe that this dataset constitutes a valuable resource to study several dimensions of human communication that go from turn-taking to the study of referring expressions. In our preliminary analyses we have looked at task success (how many differences were found out of the total number of differences) and how it evolves over time. 
In addition we have looked at scene complexity provided by the RGB components' entropy and how it could relate to speech overlaps, interruptions and the expression of uncertainty. We found there is a tendency that more complex scenes have more competitive interruptions.",{"paper_id":6498,"title":6499,"year":81,"month":855,"day":63,"doi":6500,"resource_url":6501,"first_page":63,"last_page":63,"pdf_url":6502,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6503,"paper_type":860,"authors":6504,"abstract":6510},"lrec2018-main-306","Attention for Implicit Discourse Relation Recognition","10.63317\u002F4rme2bjhe3u5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-306","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F451.pdf","cianflone-kosseim-2018-attention",[6505,6507],{"paper_id":6498,"author_seq":247,"given_name":1477,"surname":6506,"affiliation":63,"orcid":63},"Cianflone",{"paper_id":6498,"author_seq":232,"given_name":6508,"surname":6509,"affiliation":63,"orcid":63},"Leila","Kosseim","Implicit discourse relation recognition remains a challenging task as state-of-the-art approaches reach F1 scores ranging from 9.95% to 37.67% on the 2016 CoNLL shared task.  In our work, we explore the use of a neural network which exploits the strong correlation between pairs of words across two discourse arguments that implicitly signal a discourse relation. We present a novel approach to Implicit Discourse Relation Recognition that uses an encoder-decoder model with attention. Our approach is based on the assumption that a discourse argument is \"generated\" from a previous argument and conditioned on a latent discourse relation, which we detect.  
Experiments show that our model achieves an F1 score of 38.25% on fine-grained classification, outperforming previous approaches and performing comparatively with state-of-the-art on coarse-grained classification, while computing alignment parameters without the need for additional pooling and fully connected layers.",{"paper_id":6512,"title":6513,"year":81,"month":855,"day":63,"doi":6514,"resource_url":6515,"first_page":63,"last_page":63,"pdf_url":6516,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6517,"paper_type":860,"authors":6518,"abstract":6529},"lrec2018-main-307","A Context-based Approach for Dialogue Act Recognition using Simple Recurrent Neural Networks","10.63317\u002F2b2w5ksnfqov","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-307","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F525.pdf","bothe-etal-2018-context",[6519,6522,6525,6527],{"paper_id":6512,"author_seq":247,"given_name":6520,"surname":6521,"affiliation":63,"orcid":63},"Chandrakant","Bothe",{"paper_id":6512,"author_seq":232,"given_name":6523,"surname":6524,"affiliation":63,"orcid":63},"Cornelius","Weber",{"paper_id":6512,"author_seq":218,"given_name":1448,"surname":6526,"affiliation":63,"orcid":63},"Magg",{"paper_id":6512,"author_seq":203,"given_name":1471,"surname":6528,"affiliation":63,"orcid":63},"Wermter","Dialogue act recognition is an important part of natural language understanding. We investigate the way dialogue act corpora are annotated and the learning approaches used so far. We find that the dialogue act is context-sensitive within the conversation for most of the classes. Nevertheless, previous models of dialogue act classification work on the utterance-level and only very few consider context. We propose a novel context-based learning method to classify dialogue acts using a character-level language model utterance representation, and we notice significant improvement. 
We evaluate this method on the Switchboard Dialogue Act corpus, and our results show that the consideration of the preceding utterances as a context of the current utterance improves dialogue act detection.",{"paper_id":6531,"title":6532,"year":81,"month":855,"day":63,"doi":6533,"resource_url":6534,"first_page":63,"last_page":63,"pdf_url":6535,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6536,"paper_type":860,"authors":6537,"abstract":6548},"lrec2018-main-308","TreeAnnotator: Versatile Visual Annotation of Hierarchical Text Relations","10.63317\u002F2kvui6d689hn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-308","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F591.pdf","helfrich-etal-2018-treeannotator",[6538,6540,6543,6544,6547],{"paper_id":6531,"author_seq":247,"given_name":1682,"surname":6539,"affiliation":63,"orcid":63},"Helfrich",{"paper_id":6531,"author_seq":232,"given_name":6541,"surname":6542,"affiliation":63,"orcid":63},"Elias","Rieb",{"paper_id":6531,"author_seq":218,"given_name":1679,"surname":4803,"affiliation":63,"orcid":63},{"paper_id":6531,"author_seq":203,"given_name":6545,"surname":6546,"affiliation":63,"orcid":63},"Andy","Lücking",{"paper_id":6531,"author_seq":188,"given_name":2736,"surname":4012,"affiliation":63,"orcid":63},"We introduce TreeAnnotator, a graphical tool for annotating tree-like structures, in particular structures that jointly map dependency relations and inclusion hierarchies, as used by Rhetorical Structure Theory (RST). TreeAnnotator is browser-based, embedded within the UIMA framework and provides two visualization modes. TreeAnnotator’s interoperability exceeds similar tools, providing a wider range of formats, while annotation work can be completed more quickly due to a revised input method for RST dependency relations. TreeAnnotator offers a multiple window view, which allows users to inspect several annotations side by side. 
For storing and versioning annotations, the UIMA Database Interface (UIMA DI) was developed to save documents based on a pre-defined type system. These features not only connect TreeAnnotator annotations to modern technological and dialog theoretical work, but set it apart from related tools. The ease of use of TreeAnnotator and its newly designed user interface is evaluated in a user study consisting of annotating rhetorical relations with TreeAnnotator and the classic RSTTool.",{"paper_id":6550,"title":6551,"year":81,"month":855,"day":63,"doi":6552,"resource_url":6553,"first_page":63,"last_page":63,"pdf_url":6554,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6555,"paper_type":860,"authors":6556,"abstract":6562},"lrec2018-main-309","Chats and Chunks: Annotation and Analysis of Multiparty Long Casual Conversations","10.63317\u002F2anbrxfygss2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-309","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F653.pdf","gilmartin-etal-2018-chats",[6557,6560,6561],{"paper_id":6550,"author_seq":247,"given_name":6558,"surname":6559,"affiliation":63,"orcid":63},"Emer","Gilmartin",{"paper_id":6550,"author_seq":232,"given_name":4210,"surname":4211,"affiliation":63,"orcid":63},{"paper_id":6550,"author_seq":218,"given_name":3201,"surname":3202,"affiliation":63,"orcid":63},"Casual talk or social conversation is a fundamental form of spoken interaction. Corpora of casual talk often comprise relatively short dyadic conversations, although research into such talk has found longer multiparty interaction to be very common. This genre of spoken interaction is attracting more interest with attempts to build more friendly and natural spoken dialog systems. To study longer multiparty casual talk, we have assembled a collection of conversations from three existing corpora. 
We describe the collection, organization, and annotation of structural chat and chunk phases in these conversations. We then review our preliminary results, noting significant differences in the distribution of overlap, laughter and disfluency in chat and chunk phases, and finding that chunk dominates as conversations get longer. We outline our continuing work on gaining greater understanding of this genre of spoken interaction, with implications for the design of spoken dialog systems.",{"paper_id":6564,"title":6565,"year":81,"month":855,"day":63,"doi":6566,"resource_url":6567,"first_page":63,"last_page":63,"pdf_url":6568,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6569,"paper_type":860,"authors":6570,"abstract":6579},"lrec2018-main-310","Extending the gold standard for a lexical substitution task: is it worth it?","10.63317\u002F3x7prk7ms464","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-310","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F86.pdf","tanguy-etal-2018-extending",[6571,6574,6577],{"paper_id":6564,"author_seq":247,"given_name":6572,"surname":6573,"affiliation":63,"orcid":63},"Ludovic","Tanguy",{"paper_id":6564,"author_seq":232,"given_name":6575,"surname":6576,"affiliation":63,"orcid":63},"Cécile","Fabre",{"paper_id":6564,"author_seq":218,"given_name":1172,"surname":6578,"affiliation":63,"orcid":63},"Rivière","We present a new evaluation scheme for the lexical substitution task. Following (McCarthy and Navigli, 2007) we conducted an annotation task for French that mixes two datasets: in the first one, 300 sentences containing a target word (among 30 different) were submitted to annotators who were asked to provide substitutes. The second one contains the propositions of the systems that participated to the lexical substitution task based on the same data. 
The idea is first, to assess the capacity of the systems to provide good substitutes that would not have been proposed by the annotators and second, to measure the impact on the task evaluation of a new gold standard that incorporates these additional data. While (McCarthy and Navigli, 2009) have conducted a similar post hoc analysis, re-evaluation of the systems' performances has not been carried out to our knowledge. This experiment shows interesting differences between the two resulting datasets and gives insight on how automatically retrieved substitutes can provide complementary data to a lexical production task, without however a major impact on the evaluation of the systems.",{"paper_id":6581,"title":6582,"year":81,"month":855,"day":63,"doi":6583,"resource_url":6584,"first_page":63,"last_page":63,"pdf_url":6585,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6586,"paper_type":860,"authors":6587,"abstract":6592},"lrec2018-main-311","Lexical and Semantic Features for Cross-lingual Text Reuse Classification: an Experiment in English and Latin Paraphrases","10.63317\u002F459omhdr74tm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-311","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F145.pdf","moritz-steding-2018-lexical",[6588,6590],{"paper_id":6581,"author_seq":247,"given_name":2301,"surname":6589,"affiliation":63,"orcid":63},"Moritz",{"paper_id":6581,"author_seq":232,"given_name":1199,"surname":6591,"affiliation":63,"orcid":63},"Steding","Analyzing historical languages, such as Ancient Greek and Latin, is challenging. Such languages are often under-resourced and lack primary material for certain time periods. This prevents applying advanced natural-language processing (NLP) techniques and requires resorting to basic NLP not relying on machine learning. An important analysis is the discovery and classification of paraphrastic text reuse in historical languages. 
This reuse is often paraphrastic and challenges basic NLP techniques. Our goal is to improve the applicability of advanced NLP techniques on historical text reuse. We present an experiment of cross-applying classifiers—that we trained for paraphrase recognition on modern English text corpora—on historical texts. We analyze the impact of four different lexical and semantic features, on the resulting reuse-detection accuracy. We find out that—against initial conjecture—word embedding can help to drastically improve accuracy if lexical features (such as the overlap of similar words) fail.",{"paper_id":6594,"title":6595,"year":81,"month":855,"day":63,"doi":6596,"resource_url":6597,"first_page":63,"last_page":63,"pdf_url":6598,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6599,"paper_type":860,"authors":6600,"abstract":6604},"lrec2018-main-312","Investigating the Influence of Bilingual MWU on Trainee Translation Quality","10.63317\u002F5o5nuxzxq2ir","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-312","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F243.pdf","yuan-sharoff-2018-investigating",[6601,6603],{"paper_id":6594,"author_seq":247,"given_name":2406,"surname":6602,"affiliation":63,"orcid":63},"Yuan",{"paper_id":6594,"author_seq":232,"given_name":3443,"surname":3444,"affiliation":63,"orcid":63},"We applied a method for automatic extraction of bilingual multiword units (BMWUs) from a parallel corpus in order to investigate their contribution to translation quality in terms of adequacy and fluency. Our statistical analysis is based on generalized additive modelling. It has been shown that normalised BMWU ratios can be useful for estimating human translation quality. The normalized alignment ratios for BMWUs longer than two words have the greatest impact on measuring translation quality. 
It is also found that the alignment ratio for longer BMWUs is statistically closer to adequacy than to fluency.",{"paper_id":6606,"title":6607,"year":81,"month":855,"day":63,"doi":6608,"resource_url":6609,"first_page":63,"last_page":63,"pdf_url":6610,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6611,"paper_type":860,"authors":6612,"abstract":6619},"lrec2018-main-313","Evaluation of Dictionary Creating Methods for Finno-Ugric Minority Languages","10.63317\u002F34nyotif2k7s","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-313","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F312.pdf","ferenczi-etal-2018-evaluation",[6613,6616,6617,6618],{"paper_id":6606,"author_seq":247,"given_name":6614,"surname":6615,"affiliation":63,"orcid":63},"Zsanett","Ferenczi",{"paper_id":6606,"author_seq":232,"given_name":4713,"surname":4714,"affiliation":63,"orcid":63},{"paper_id":6606,"author_seq":218,"given_name":4708,"surname":1269,"affiliation":63,"orcid":63},{"paper_id":6606,"author_seq":203,"given_name":4705,"surname":4706,"affiliation":63,"orcid":63},"In this paper, we present the evaluation of several bilingual dictionary building methods applied to {Komi-Permyak, Komi-Zyrian, Hill Mari, Meadow Mari, Northern Saami, Udmurt}-{English, Finnish, Hungarian, Russian} language pairs. Since these Finno-Ugric minority languages are under-resourced and standard dictionary building methods require a large amount of pre-processed data, we had to find alternative methods. In a thorough evaluation, we compare the results for each method, which proved our expectations that the precision of standard lexicon building methods is quite low for under-resourced languages. However, utilizing Wikipedia title pairs extracted via inter-language links and Wiktionary-based methods provided useful results. The newly created word pairs enriched with several linguistic information are to be deployed on the web in the framework of Wiktionary. 
With our dictionaries, the number of Wiktionary entries in the above mentioned Finno-Ugric minority languages can be multiplied.",{"paper_id":6621,"title":6622,"year":81,"month":855,"day":63,"doi":6623,"resource_url":6624,"first_page":63,"last_page":63,"pdf_url":6625,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6626,"paper_type":860,"authors":6627,"abstract":6636},"lrec2018-main-314","Dysarthric speech evaluation: automatic and perceptual approaches","10.63317\u002F57pvtwp4ntjt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-314","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F498.pdf","laaridh-etal-2018-dysarthric",[6628,6631,6633],{"paper_id":6621,"author_seq":247,"given_name":6629,"surname":6630,"affiliation":63,"orcid":63},"Imed","Laaridh",{"paper_id":6621,"author_seq":232,"given_name":3328,"surname":6632,"affiliation":63,"orcid":63},"Meunier",{"paper_id":6621,"author_seq":218,"given_name":6634,"surname":6635,"affiliation":63,"orcid":63},"Corinne","Fredouille","Perceptual evaluation is still the most common method in clinical practice for the diagnosis and monitoring of the condition progression of people suffering from dysarthria (or speech disorders more generally). Such evaluations are frequently described as non-trivial, subjective and highly time-consuming (depending on the evaluation level). Clinicians have, therefore, expressed their need for new objective evaluation tools more adapted to longitudinal studies or rehabilitation context. We proposed earlier an automatic approach for the anomaly detection at the phone level for dysarthric speech. The system behavior was studied and validated on different corpora and speech styles. Nonetheless, the lack of annotated French dysarthric speech corpora has limited our capacity to analyze some aspects of its behavior, and notably,its severity (more anomalies detected automatically compared with human expert). 
To overcome this limitation, we proposed an original perceptual evaluation protocol applied to a limited set of decisions made by the automatic system, related to the presence of anomalies. This evaluation was carried out by a jury of 29 non-naive individuals. In addition to interesting information related to the system behavior, the evaluation protocol highlighted the difficulty for a human, even expert, to apprehend and detect deviations at the word level in dysarthric speech.",{"paper_id":6638,"title":6639,"year":81,"month":855,"day":63,"doi":6640,"resource_url":6641,"first_page":63,"last_page":63,"pdf_url":6642,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6643,"paper_type":860,"authors":6644,"abstract":6654},"lrec2018-main-315","Towards an Automatic Assessment of Crowdsourced Data for NLU","10.63317\u002F2esirisgya4j","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-315","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F539.pdf","braunger-etal-2018-towards",[6645,6648,6650,6652],{"paper_id":6638,"author_seq":247,"given_name":6646,"surname":6647,"affiliation":63,"orcid":63},"Patricia","Braunger",{"paper_id":6638,"author_seq":232,"given_name":1251,"surname":6649,"affiliation":63,"orcid":63},"Maier",{"paper_id":6638,"author_seq":218,"given_name":2633,"surname":6651,"affiliation":63,"orcid":63},"Wessling",{"paper_id":6638,"author_seq":203,"given_name":2301,"surname":6653,"affiliation":63,"orcid":63},"Schmidt","Recent development of spoken dialog systems has moved away from a command-style input and aims at allowing a natural input style. Obtaining suitable data for training and testing such systems is a significant challenge. We investigate with which methods data elicited via crowdsourcing can be assessed with respect to its naturalness and usefulness. 
Since the criteria with which to assess usefulness depend on the application purpose of crowdsourced data we investigate various facets such as noisy data, naturalness and building natural language understanding (NLU) models. Our results show that valid data can be automatically identified with the help of a word based language model. A comparison of crowdsourced data and system usage data on lexical, syntactic and pragmatic level reveals detailed information on the differences between both data sets. However, we show that using crowdsourced data for training NLU services achieves similar results as system usage data.",{"paper_id":6656,"title":6657,"year":81,"month":855,"day":63,"doi":6658,"resource_url":6659,"first_page":63,"last_page":63,"pdf_url":6660,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6661,"paper_type":860,"authors":6662,"abstract":6680},"lrec2018-main-316","Visual Choice of Plausible Alternatives: An Evaluation of Image-based Commonsense Causal Reasoning","10.63317\u002F4axuyktintv2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-316","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F560.pdf","yeo-etal-2018-visual",[6663,6666,6668,6670,6672,6674,6677],{"paper_id":6656,"author_seq":247,"given_name":6664,"surname":6665,"affiliation":63,"orcid":63},"Jinyoung","Yeo",{"paper_id":6656,"author_seq":232,"given_name":6667,"surname":4032,"affiliation":63,"orcid":63},"Gyeongbok",{"paper_id":6656,"author_seq":218,"given_name":6669,"surname":1588,"affiliation":63,"orcid":63},"Gengyu",{"paper_id":6656,"author_seq":203,"given_name":6671,"surname":1110,"affiliation":63,"orcid":63},"Seungtaek",{"paper_id":6656,"author_seq":188,"given_name":6673,"surname":4891,"affiliation":63,"orcid":63},"Hyunsouk",{"paper_id":6656,"author_seq":172,"given_name":6675,"surname":6676,"affiliation":63,"orcid":63},"Reinald","Kim 
Amplayo",{"paper_id":6656,"author_seq":155,"given_name":6678,"surname":6679,"affiliation":63,"orcid":63},"Seung-won","Hwang","This paper proposes the task of Visual COPA (VCOPA). Given a premise image and two alternative images, the task is to identify the more plausible alternative with their commonsense causal context. The VCOPA task is designed as its desirable machine system needs a more detailed understanding of the image, commonsense knowledge, and complex causal reasoning than state-of-the-art AI techniques. For that, we generate an evaluation dataset containing 380 VCOPA questions and over 1K images with various topics, which is amenable to automatic evaluation, and present the performance of baseline reasoning approaches as initial benchmarks for future systems.",{"paper_id":6682,"title":6683,"year":81,"month":855,"day":63,"doi":6684,"resource_url":6685,"first_page":63,"last_page":63,"pdf_url":6686,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6687,"paper_type":860,"authors":6688,"abstract":6697},"lrec2018-main-317","Is it worth it? Budget-related evaluation metrics for model selection","10.63317\u002F4z538t9j482t","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-317","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F615.pdf","klubicka-etal-2018-worth",[6689,6691,6694],{"paper_id":6682,"author_seq":247,"given_name":6208,"surname":6690,"affiliation":63,"orcid":63},"Klubička",{"paper_id":6682,"author_seq":232,"given_name":6692,"surname":6693,"affiliation":63,"orcid":63},"Giancarlo D.","Salton",{"paper_id":6682,"author_seq":218,"given_name":6695,"surname":6696,"affiliation":63,"orcid":63},"John D.","Kelleher","Creating a linguistic resource is often done by using a machine learning model that filters the content that goes through to a human annotator, before going into the final resource. 
However, budgets are often limited, and the amount of available data exceeds the amount of affordable annotation. In order to optimize the benefit from the invested human work, we argue that deciding on which model one should employ depends not only on generalized evaluation metrics such as F-score, but also on the gain metric. Because the model with the highest F-score may not necessarily have the best sequencing of predicted classes, this may lead to wasting funds on annotating false positives, yielding zero improvement of the linguistic resource. We exemplify our point with a case study, using real data from a task of building a verb-noun idiom dictionary. We show that, given the choice of three systems with varying F-scores, the system with the highest F-score does not yield the highest profits. In other words, in our case the cost-benefit trade off is more favorable for a system with a lower F-score.",{"paper_id":6699,"title":6700,"year":81,"month":855,"day":63,"doi":6701,"resource_url":6702,"first_page":63,"last_page":63,"pdf_url":6703,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6704,"paper_type":860,"authors":6705,"abstract":6712},"lrec2018-main-318","Automated Evaluation of Out-of-Context Errors","10.63317\u002F487pnos95qjt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-318","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F689.pdf","huber-etal-2018-automated",[6706,6708,6710],{"paper_id":6699,"author_seq":247,"given_name":2451,"surname":6707,"affiliation":63,"orcid":63},"Huber",{"paper_id":6699,"author_seq":232,"given_name":2633,"surname":6709,"affiliation":63,"orcid":63},"Niehues",{"paper_id":6699,"author_seq":218,"given_name":2050,"surname":6711,"affiliation":63,"orcid":63},"Waibel","We present a new approach to evaluate computational models for the task of text understanding by the means of out-of-context error detection. 
Through the novel design of our automated modification process, existing large-scale data sources can be adopted for a vast number of text understanding tasks. The data is thereby altered on a semantic level, allowing models to be tested against a challenging set of modified text passages that require to comprise a broader narrative discourse. Our newly introduced task targets actual real-world problems of transcription and translation systems by inserting authentic out-of-context errors. The automated modification process is applied to the 2016 TEDTalk corpus. Entirely automating the process allows the adoption of complete datasets at low cost, facilitating supervised learning procedures and deeper networks to be trained and tested. To evaluate the quality of the modification algorithm a language model and a supervised binary classification model are trained and tested on the altered dataset. A human baseline evaluation is examined to compare the results with human performance. The outcome of the evaluation task indicates the difficulty to detect semantic errors for machine-learning algorithms and humans, showing that the errors cannot be identified when limited to a single sentence.",{"paper_id":6714,"title":6715,"year":81,"month":855,"day":63,"doi":6716,"resource_url":6717,"first_page":63,"last_page":63,"pdf_url":6718,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6719,"paper_type":860,"authors":6720,"abstract":6733},"lrec2018-main-319","Matics Software Suite: New Tools for Evaluation and Data 
Exploration","10.63317\u002F28rukbzbwytr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-319","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F780.pdf","galibert-etal-2018-matics",[6721,6723,6724,6727,6730],{"paper_id":6714,"author_seq":247,"given_name":869,"surname":6722,"affiliation":63,"orcid":63},"Galibert",{"paper_id":6714,"author_seq":232,"given_name":2218,"surname":6386,"affiliation":63,"orcid":63},{"paper_id":6714,"author_seq":218,"given_name":6725,"surname":6726,"affiliation":63,"orcid":63},"Agnes","Delaborde",{"paper_id":6714,"author_seq":203,"given_name":6728,"surname":6729,"affiliation":63,"orcid":63},"Sabrina","Lecadre",{"paper_id":6714,"author_seq":188,"given_name":6731,"surname":6732,"affiliation":63,"orcid":63},"Juliette","Kahn","Matics is a free and open-source software suite for exploring annotated data and evaluation results.  It proposes a dataframe data model allowing the intuitive exploration of data characteristics and evaluation results and provides support for graphing the values and running appropriate statistical tests. 
The tools already run on several Natural Language Processing tasks and standard annotation formats,  and are under on-going development.",{"paper_id":6735,"title":6736,"year":81,"month":855,"day":63,"doi":6737,"resource_url":6738,"first_page":63,"last_page":63,"pdf_url":6739,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6740,"paper_type":860,"authors":6741,"abstract":6749},"lrec2018-main-320","MGAD: Multilingual Generation of Analogy Datasets","10.63317\u002F4pnisfacb4bm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-320","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1022.pdf","abdou-etal-2018-mgad",[6742,6743,6746],{"paper_id":6735,"author_seq":247,"given_name":5312,"surname":1300,"affiliation":63,"orcid":63},{"paper_id":6735,"author_seq":232,"given_name":6744,"surname":6745,"affiliation":63,"orcid":63},"Artur","Kulmizev",{"paper_id":6735,"author_seq":218,"given_name":6747,"surname":6748,"affiliation":63,"orcid":63},"Vinit","Ravishankar","We present a novel, minimally supervised method of generating word embedding evaluation datasets for a large number of languages. Our approach utilizes existing dependency treebanks and parsers in order to create language-specific syntactic analogy datasets that do not rely on translation or human annotation. As part of our work, we offer syntactic analogy datasets for three previously unexplored languages: Arabic, Hindi, and Russian. 
We further present an evaluation of three popular word embedding algorithms (Word2Vec,GloVe, LexVec) against these datasets and explore how the performance of each word embedding algorithm varies between several syntactic categories.",{"paper_id":6751,"title":6752,"year":81,"month":855,"day":63,"doi":6753,"resource_url":6754,"first_page":63,"last_page":63,"pdf_url":6755,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6756,"paper_type":860,"authors":6757,"abstract":6761},"lrec2018-main-321","MIsA: Multilingual “IsA” Extraction from Corpora","10.63317\u002F3zy8o5mqz6gw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-321","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F254.pdf","faralli-etal-2018-misa",[6758,6759,6760],{"paper_id":6751,"author_seq":247,"given_name":2733,"surname":2734,"affiliation":63,"orcid":63},{"paper_id":6751,"author_seq":232,"given_name":6095,"surname":6096,"affiliation":63,"orcid":63},{"paper_id":6751,"author_seq":218,"given_name":2741,"surname":2742,"affiliation":63,"orcid":63},"In this paper we present Multilingual IsA (MIsA), which is a collection of hypernymy relations in five languages (i.e.,  English, Spanish, French, Italian and Dutch) extracted from the corresponding full Wikipedia corpus. For each language, we first established a set of existing (viz. found in literature) or newly defined lexico-syntactic patterns. Similarly to WebIsADb, the resulting resource contains hypernymy relations represented as \"tuples\", as well as additional information such as provenance, context of the extraction, etc. To measure the precision of the patterns, we performed a manual assessment of the quality of the extracted relations and an error analysis.  
In addition, we release the software developed for the extraction of the hypernym tuples.",{"paper_id":6763,"title":6764,"year":81,"month":855,"day":63,"doi":6765,"resource_url":6766,"first_page":63,"last_page":63,"pdf_url":6767,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6768,"paper_type":860,"authors":6769,"abstract":6779},"lrec2018-main-322","Biomedical term normalization of EHRs with UMLS","10.63317\u002F2nogdo73gqb9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-322","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F509.pdf","perez-miguel-etal-2018-biomedical",[6770,6773,6776],{"paper_id":6763,"author_seq":247,"given_name":6771,"surname":6772,"affiliation":63,"orcid":63},"Naiara","Perez-Miguel",{"paper_id":6763,"author_seq":232,"given_name":6774,"surname":6775,"affiliation":63,"orcid":63},"Montse","Cuadros",{"paper_id":6763,"author_seq":218,"given_name":6777,"surname":6778,"affiliation":63,"orcid":63},"German","Rigau","This paper presents a novel prototype for biomedical term normalization of electronic health record excerpts with the Unified Medical Language System (UMLS) Metathesaurus, a large, multi-lingual compendium of biomedical and health-related terminologies. Despite the prototype being multilingual and cross-lingual by design, we first focus on processing clinical text in Spanish because there is no existing tool for this language and for this specific purpose. The tool is based on Apache Lucene to index the Metathesaurus and generate mapping candidates from input text. It uses the IXA pipeline for basic language processing and resolves lexical ambiguities with the UKB toolkit. It has been evaluated by measuring its agreement with MetaMap --a mature software to discover UMLS concepts in English texts-- in two English-Spanish parallel corpora. 
In addition, we present a web-based interface for the tool.",{"paper_id":6781,"title":6782,"year":81,"month":855,"day":63,"doi":6783,"resource_url":6784,"first_page":63,"last_page":63,"pdf_url":6785,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6786,"paper_type":860,"authors":6787,"abstract":6793},"lrec2018-main-323","Revisiting the Task of Scoring Open IE Relations","10.63317\u002F2qe447uqn7co","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-323","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F574.pdf","lechelle-langlais-2018-revisiting",[6788,6791],{"paper_id":6781,"author_seq":247,"given_name":6789,"surname":6790,"affiliation":63,"orcid":63},"William","Léchelle",{"paper_id":6781,"author_seq":232,"given_name":2089,"surname":6792,"affiliation":63,"orcid":63},"Langlais","Knowledge Base Completion infers missing facts from existing ones in knowledge bases. As recent Open Information Extraction systems allow us to extract ever larger (yet incomplete) open-domain Knowledge Bases from text, we seek to probabilistically extend the limited coverage we get from existing facts, to arbitrary queries about plausible information. We propose a simple baseline, based on language modeling and trained with off-the-shelf programs, which gives competitive results in the previously defined protocol for this task, and provides an independent source of signal to judge arbitrary fact plausibility. We reexamine this protocol, measure the (large) impact of the negative example generation procedure, which we find to run contrary to the belief put forward in previous work. 
We conduct a small manual evaluation, giving insights into the rudimentary automatic evaluation protocol, and analyse the shortcomings of our model.",{"paper_id":6795,"title":6796,"year":81,"month":855,"day":63,"doi":6797,"resource_url":6798,"first_page":63,"last_page":63,"pdf_url":6799,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6800,"paper_type":860,"authors":6801,"abstract":6807},"lrec2018-main-324","A supervised approach to taxonomy extraction using word embeddings","10.63317\u002F4dycvo3s68zg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-324","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F601.pdf","sarkar-etal-2018-supervised",[6802,6805,6806],{"paper_id":6795,"author_seq":247,"given_name":6803,"surname":6804,"affiliation":63,"orcid":63},"Rajdeep","Sarkar",{"paper_id":6795,"author_seq":232,"given_name":3687,"surname":3688,"affiliation":63,"orcid":63},{"paper_id":6795,"author_seq":218,"given_name":3690,"surname":3691,"affiliation":63,"orcid":63},"Large collections of texts are commonly generated by large organizations and making sense of these collections of texts is a significant challenge. One method for handling this is to organize the concepts into a hierarchical structure such that similar concepts can be discovered and easily browsed. This approach was the subject of a recent evaluation campaign, TExEval, however the results of this task showed that none of the systems consistently outperformed a relatively simple baseline.In order to solve this issue, we propose a new method that uses supervised learning to combine multiple features with a support vector machine classifier including the baseline features. 
We show that this outperforms the baseline and thus provides a stronger method for identifying taxonomic relations than previous methods",{"paper_id":6809,"title":6810,"year":81,"month":855,"day":63,"doi":6811,"resource_url":6812,"first_page":63,"last_page":63,"pdf_url":6813,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6814,"paper_type":860,"authors":6815,"abstract":6819},"lrec2018-main-325","A Chinese Dataset with Negative Full Forms for General Abbreviation Prediction","10.63317\u002F48wusymdn5oo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-325","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F613.pdf","zhang-sun-2018-chinese",[6816,6818],{"paper_id":6809,"author_seq":247,"given_name":6817,"surname":2381,"affiliation":63,"orcid":63},"Yi",{"paper_id":6809,"author_seq":232,"given_name":1585,"surname":3774,"affiliation":63,"orcid":63},"Abbreviation is a common phenomenon across languages, especially in Chinese. In most cases, if an expression can be abbreviated, its abbreviation is used more often than its fully expanded forms, since people tend to convey information in a most concise way. For various language processing tasks, abbreviation is an obstacle to improving the performance, as the textual form of an abbreviation does not express useful information, unless it's expanded to the full form. Abbreviation prediction means associating the fully expanded forms with their abbreviations. However, due to the deficiency in the abbreviation corpora, such a task is limited in current studies, especially considering general abbreviation prediction should also include those full form expressions that do not have general abbreviations, namely the negative full forms (NFFs). Corpora incorporating negative full forms for general abbreviation prediction are few in number. 
In order to promote the research in this area, we build a dataset for general  Chinese abbreviation prediction, which needs a few preprocessing steps, and evaluate several different models on the built dataset. The dataset is available at https:\u002F\u002Fgithub.com\u002Flancopku\u002FChinese-abbreviation-dataset.",{"paper_id":6821,"title":6822,"year":81,"month":855,"day":63,"doi":6823,"resource_url":6824,"first_page":63,"last_page":63,"pdf_url":6825,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6826,"paper_type":860,"authors":6827,"abstract":6836},"lrec2018-main-326","Korean TimeBank Including Relative Temporal Information","10.63317\u002F2x3okcngus3b","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-326","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F664.pdf","lim-etal-2018-korean",[6828,6831,6834],{"paper_id":6821,"author_seq":247,"given_name":6829,"surname":6830,"affiliation":63,"orcid":63},"Chae-Gyun","Lim",{"paper_id":6821,"author_seq":232,"given_name":6832,"surname":6833,"affiliation":63,"orcid":63},"Young-Seob","Jeong",{"paper_id":6821,"author_seq":218,"given_name":6835,"surname":1110,"affiliation":63,"orcid":63},"Ho-Jin","Since most documents have temporal information that can be a basis for understanding the context, the importance of temporal information extraction researches is steadily growing. Although various attempts have been made to extract temporal information from researchers internationally, it is difficult to apply them to other languages because they are usually targeted at specific languages such as English. Several annotation languages and datasets had been proposed for the studies of temporal information extraction on Korean documents, however, the representation of relative temporal information is not enough to maintain it explicitly. 
In this paper, we propose a concept of relative temporal information and supplement a Korean annotation language to represent new relative expressions, and extend an annotated dataset, Korean TimeBank, through the revised language. We expect that it is possible to utilize potential features from the Korean corpus by clearly annotating relative temporal relationships and to use well-refined Korean TimeBank in future studies.",{"paper_id":6838,"title":6839,"year":81,"month":855,"day":63,"doi":6840,"resource_url":6841,"first_page":63,"last_page":63,"pdf_url":6842,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6843,"paper_type":860,"authors":6844,"abstract":6849},"lrec2018-main-327","Mining Biomedical Publications With The LAPPS Grid","10.63317\u002F328h4vy5y3tj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-327","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F666.pdf","ide-etal-2018-mining",[6845,6846,6847],{"paper_id":6838,"author_seq":247,"given_name":1400,"surname":1401,"affiliation":63,"orcid":63},{"paper_id":6838,"author_seq":232,"given_name":4661,"surname":4662,"affiliation":63,"orcid":63},{"paper_id":6838,"author_seq":218,"given_name":6848,"surname":1104,"affiliation":63,"orcid":63},"Jin-Dong","It is widely recognized that the ability to exploit Natural Language Processing (NLP) text mining strategies has the potential to increase productivity and innovation in the sciences by orders of magnitude, by enabling scientists to pull information from  research articles in scientific disciplines such as genomics and biomedicine.  The Language Applications (LAPPS) Grid  is an infrastructure for rapid development of natural language processing applications (NLP) that provides an ideal platform to support mining scientific literature. 
Its Galaxy interface and the interoperability among tools together provide an intuitive and easy-to-use platform, and users can experiment with and exploit NLP tools and resources without the need to determine which are suited to a particular task, and without the need for significant computer expertise. The LAPPS Grid has collaborated with the developers of PubAnnotation to integrate the services and resources provided by each in order to greatly enhance the user's ability to annotate scientific publications and share the results. This poster\u002Fdemo shows how the LAPPS Grid can facilitate mining scientific publications, including identification and extraction of relevant entities, relations, and events; iterative manual correction and evaluation of automatically-produced annotations, and customization of supporting resources to accommodate specific domains.",{"paper_id":6851,"title":6852,"year":81,"month":855,"day":63,"doi":6853,"resource_url":6854,"first_page":63,"last_page":63,"pdf_url":6855,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6856,"paper_type":860,"authors":6857,"abstract":6864},"lrec2018-main-328","An Initial Test Collection for Ranked Retrieval of SMS Conversations","10.63317\u002F58uds5hybjm9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-328","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F695.pdf","sankepally-oard-2018-initial",[6858,6861],{"paper_id":6851,"author_seq":247,"given_name":6859,"surname":6860,"affiliation":63,"orcid":63},"Rashmi","Sankepally",{"paper_id":6851,"author_seq":232,"given_name":6862,"surname":6863,"affiliation":63,"orcid":63},"Douglas W.","Oard","This paper describes a test collection for evaluating systems that search English SMS (Short Message Service) conversations. The collection is built from about 120,000 text messages. 
Topic development involved identifying typical types of information needs, then generating topics of each type for which relevant content might be found in the collection. Relevance judgments were then made for groups of messages that were most highly ranked by one or more of several ranked retrieval systems. The resulting TREC style test collection can be used to compare some alternative retrieval system designs.",{"paper_id":6866,"title":6867,"year":81,"month":855,"day":63,"doi":6868,"resource_url":6869,"first_page":63,"last_page":63,"pdf_url":6870,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6871,"paper_type":860,"authors":6872,"abstract":6887},"lrec2018-main-329","FrNewsLink : a corpus linking TV Broadcast News Segments and Press Articles","10.63317\u002F592zva5xsfoj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-329","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F709.pdf","camelin-etal-2018-frnewslink",[6873,6876,6877,6880,6883,6884],{"paper_id":6866,"author_seq":247,"given_name":6874,"surname":6875,"affiliation":63,"orcid":63},"Nathalie","Camelin",{"paper_id":6866,"author_seq":232,"given_name":1121,"surname":1122,"affiliation":63,"orcid":63},{"paper_id":6866,"author_seq":218,"given_name":6878,"surname":6879,"affiliation":63,"orcid":63},"Abdessalam","Bouchekif",{"paper_id":6866,"author_seq":203,"given_name":6881,"surname":6882,"affiliation":63,"orcid":63},"Anais","Landeau",{"paper_id":6866,"author_seq":188,"given_name":1130,"surname":1131,"affiliation":63,"orcid":63},{"paper_id":6866,"author_seq":172,"given_name":6885,"surname":6886,"affiliation":63,"orcid":63},"Yannick","Estève","In this article, we describe FrNewsLink, a corpus allowing to address several applicative tasks that we make publicly available. 
It gathers several resources from TV Broadcast News (TVBN) shows and press articles such as automatic transcription of TVBN shows, text extracted from on-line press articles, manual annotations for topic segmentation of TVBN shows and linking information between topic segments and press articles. The FrNewsLink corpus is based on 112 (TVBN) shows recorded during two periods in 2014 and 2015. Concomitantly, a set of 24,7k press articles has been gathered. Beyond topic segmentation, this corpus allows to study semantic similarity and multimedia News linking.",{"paper_id":6889,"title":6890,"year":81,"month":855,"day":63,"doi":6891,"resource_url":6892,"first_page":63,"last_page":63,"pdf_url":6893,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6894,"paper_type":860,"authors":6895,"abstract":6899},"lrec2018-main-330","PyRATA, Python Rule-based feAture sTructure Analysis","10.63317\u002F32aosrgo4x2x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-330","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F732.pdf","hernandez-hazem-2018-pyrata",[6896,6898],{"paper_id":6889,"author_seq":247,"given_name":1641,"surname":6897,"affiliation":63,"orcid":63},"Hernandez",{"paper_id":6889,"author_seq":232,"given_name":1797,"surname":1798,"affiliation":63,"orcid":63},"In this paper, we present a new Python 3 module named PyRATA, which stands for ”Python Rule-based feAture sTructure Analysis”. The module is released under the Apache V2 license. It aims at supporting rules-based analysis on structured data. PyRATA offers a language expressiveness which covers the functionalities of all the concurrent modules and more. Designed to be intuitive, the pattern syntax and the engine API follow existing standard definitions; Respectively Perl regular expression syntax and Python re module API. Using a simple native Python data structure (i.e. 
sequence of feature sets) allows it to deal with various kinds of data (textual or not) at various levels, such as a list of words, a list of sentences, a list of posts of a forum thread, a list of events of a calendar... This specificity makes it free from any (linguistic) process.",{"paper_id":6901,"title":6902,"year":81,"month":855,"day":63,"doi":6903,"resource_url":6904,"first_page":63,"last_page":63,"pdf_url":6905,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6906,"paper_type":860,"authors":6907,"abstract":6937},"lrec2018-main-331","Towards Processing of the Oral History Interviews and Related Printed Documents","10.63317\u002F2joemn8ghqdi","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-331","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F196.pdf","zajic-etal-2018-towards",[6908,6911,6914,6917,6919,6922,6925,6928,6930,6932,6934],{"paper_id":6901,"author_seq":247,"given_name":6909,"surname":6910,"affiliation":63,"orcid":63},"Zbyněk","Zajíc",{"paper_id":6901,"author_seq":232,"given_name":6912,"surname":6913,"affiliation":63,"orcid":63},"Lucie","Skorkovská",{"paper_id":6901,"author_seq":218,"given_name":6915,"surname":6916,"affiliation":63,"orcid":63},"Petr","Neduchal",{"paper_id":6901,"author_seq":203,"given_name":4670,"surname":6918,"affiliation":63,"orcid":63},"Ircing",{"paper_id":6901,"author_seq":188,"given_name":6920,"surname":6921,"affiliation":63,"orcid":63},"Josef 
V.","Psutka",{"paper_id":6901,"author_seq":172,"given_name":6923,"surname":6924,"affiliation":63,"orcid":63},"Marek","Hrúz",{"paper_id":6901,"author_seq":155,"given_name":6926,"surname":6927,"affiliation":63,"orcid":63},"Aleš","Pražák",{"paper_id":6901,"author_seq":138,"given_name":2554,"surname":6929,"affiliation":63,"orcid":63},"Soutner",{"paper_id":6901,"author_seq":121,"given_name":2633,"surname":6931,"affiliation":63,"orcid":63},"Švec",{"paper_id":6901,"author_seq":104,"given_name":5297,"surname":6933,"affiliation":63,"orcid":63},"Bureš",{"paper_id":6901,"author_seq":87,"given_name":6935,"surname":6936,"affiliation":63,"orcid":63},"Luděk","Müller","In this paper, we describe the initial stages of our project, the goal of which is to create an integrated archive of the recordings, scanned documents, and photographs that would be accessible online and would provide multifaceted search capabilities (spoken content, biographical information, relevant time period, etc.). The recordings contain retrospective interviews with the witnesses of the totalitarian regimes in Czechoslovakia, where the vocabulary used in such interviews consists of many archaic words and named entities that are now quite rare in everyday speech. The scanned documents consist of text materials and photographs mainly from the home archives of the interviewees or the archive of the State Security. These documents are usually typewritten or even handwritten and have really bad optical quality. In order to build an integrated archive, we will employ mainly methods of automatic speech recognition (ASR), automatic indexing and search in recognized recordings and, to a certain extent, also the optical character recognition (OCR). Other natural language processing techniques like topic detection are also planned to be used in the later stages of the project. 
This paper focuses on the processing of the speech data using ASR and the scanned typewritten documents with OCR and describes the initial experiments.",{"paper_id":6939,"title":6940,"year":81,"month":855,"day":63,"doi":6941,"resource_url":6942,"first_page":63,"last_page":63,"pdf_url":6943,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6944,"paper_type":860,"authors":6945,"abstract":6947},"lrec2018-main-332","Multi Modal Distance - An Approach to Stemma Generation With Weighting","10.63317\u002F2z83m8mxpbcx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-332","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F285.pdf","hoenen-2018-multi",[6946],{"paper_id":6939,"author_seq":247,"given_name":2501,"surname":2502,"affiliation":63,"orcid":63},"Stemma generation can be understood as a task where an original manuscript M gets copied and copies – due to the manual mode of copying – vary from each other and from M . Copies $M_1 , .. , M_k$ which survive historical loss serve as input to a mapping process estimating a directed acyclic graph (tree) which is the most likely representation of their copy history. One can first tokenize and align the texts of $M_1, .., M_k$ and then produce a pairwise distance matrix between them. From this, one can finally derive a tree with various methods, for instance Neighbor-Joining (NJ) (Saitou and Nei, 1987). For computing those matrices, previous research has applied unweighted approaches to token similarity (implicitly interpreting each token pair as a binary observation: identical or different), see Mooney et al. (2003). The effects of weighting have then been investigated and Spencer et al. (2004b) found them to be small in their (not necessarily all) scenario(s). 
The present approach goes beyond the token level and instead of a binary comparison uses a distance model on the basis of psycholinguistically gained distance matrices of letters in three modalities: vision, audition and motorics. Results indicate that this type of weighting has positive effects on stemma generation.",{"paper_id":6949,"title":6950,"year":81,"month":855,"day":63,"doi":6951,"resource_url":6952,"first_page":63,"last_page":63,"pdf_url":6953,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6954,"paper_type":860,"authors":6955,"abstract":6960},"lrec2018-main-333","A Corpus of Natural Multimodal Spatial Scene Descriptions","10.63317\u002F3xbacabz879n","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-333","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F296.pdf","han-schlangen-2018-corpus",[6956,6958],{"paper_id":6949,"author_seq":247,"given_name":6957,"surname":2403,"affiliation":63,"orcid":63},"Ting",{"paper_id":6949,"author_seq":232,"given_name":1199,"surname":6959,"affiliation":63,"orcid":63},"Schlangen","We present a corpus of multimodal spatial descriptions, a common scenario in route giving tasks. Participants provided natural spatial scene descriptions with speech and iconic\u002Fabstract deictic hand gestures. The scenes were composed of simple geometric objects. While the language denotes object shape and visual properties (e.g., colour), the abstract deictic gestures “placed” objects in gesture space to denote spatial relations of objects. Only together with speech do these gestures receive defined meanings. Hence, the presented corpus goes beyond previous work on gestures in multimodal interfaces that either focusses on gestures with predefined meanings (multimodal commands) or provides hand motion data without accompanying speech. 
At the same time, the setting is more constrained than full human\u002Fhuman interaction, making the resulting data more amenable to computational analysis and more directly useable for learning natural computer interfaces. Our preliminary analysis results show that co-verbal deictic gestures in the corpus reflect spatial configurations of objects, and there are variations of gesture space and verbal descriptions. The provided verbal descriptions and hand motion data will enable modelling the interpretations of natural multimodal descriptions with machine learning methods, as well as other tasks such as generating natural multimodal spatial descriptions.",{"paper_id":6962,"title":6963,"year":81,"month":855,"day":63,"doi":6964,"resource_url":6965,"first_page":63,"last_page":63,"pdf_url":6966,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6967,"paper_type":860,"authors":6968,"abstract":6978},"lrec2018-main-334","The Effects of Unimodal Representation Choices on Multimodal Learning","10.63317\u002F5di7or68yiw6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-334","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F330.pdf","ito-etal-2018-effects",[6969,6972,6975],{"paper_id":6962,"author_seq":247,"given_name":6970,"surname":6971,"affiliation":63,"orcid":63},"Fernando Tadao","Ito",{"paper_id":6962,"author_seq":232,"given_name":6973,"surname":6974,"affiliation":63,"orcid":63},"Helena de Medeiros","Caseli",{"paper_id":6962,"author_seq":218,"given_name":6976,"surname":6977,"affiliation":63,"orcid":63},"Jander","Moreira","Multimodal representations are distributed vectors that map multiple modes of information to a single mathematical space, where distances between instances delineate their similarity. In most cases, using a single unimodal representation technique is sufficient for each mode in the creation of multimodal spaces. 
In this paper, we investigate how different unimodal representations can be combined, and argue that the way they are combined can affect the performance, representation accuracy and classification metrics of other multimodal methods. In the experiments present in this paper, we used a dataset composed of images and text descriptions of products that have been extracted from an e-commerce site in Brazil. From this dataset, we tested our hypothesis in common classification problems to evaluate how multimodal representations can differ according to their component unimodal representation methods. For this domain, we selected eight methods of unimodal representation: LSI, LDA, Word2Vec, GloVe for text; SIFT, SURF, ORB and VGG19 for images. Multimodal representations were built by a multimodal deep autoencoder and a bidirectional deep neural network.",{"paper_id":6980,"title":6981,"year":81,"month":855,"day":63,"doi":6982,"resource_url":6983,"first_page":63,"last_page":63,"pdf_url":6984,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6985,"paper_type":860,"authors":6986,"abstract":6991},"lrec2018-main-335","An Evaluation Framework for Multimodal Interaction","10.63317\u002F4qhwop3jn5u3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-335","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F466.pdf","krishnaswamy-pustejovsky-2018-evaluation",[6987,6990],{"paper_id":6980,"author_seq":247,"given_name":6988,"surname":6989,"affiliation":63,"orcid":63},"Nikhil","Krishnaswamy",{"paper_id":6980,"author_seq":232,"given_name":1016,"surname":1017,"affiliation":63,"orcid":63},"In this paper we present a framework for evaluating interactions between a human user and an embodied virtual agent that communicates using natural language, gesture, and by executing actions in a shared context created through a visual simulation interface.        
These interactions take place in real time and demonstrate collaboration between a human and a computer on object interaction and manipulation.  Our framework leverages the semantics of language and gesture to assess the level of mutual understanding during the interaction and the ease with which the two agents communicate.  We present initial results from trials involving construction tasks in a blocks world scenario and discuss extensions of the evaluation framework to more naturalistic and robust interactions.",{"paper_id":6993,"title":6994,"year":81,"month":855,"day":63,"doi":6995,"resource_url":6996,"first_page":63,"last_page":63,"pdf_url":6997,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":6998,"paper_type":860,"authors":6999,"abstract":7009},"lrec2018-main-336","The WAW Corpus: The First Corpus of Interpreted Speeches and their Translations for English and Arabic","10.63317\u002F4yztmfdo5eha","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-336","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F507.pdf","abdelali-etal-2018-waw",[7000,7001,7004,7007],{"paper_id":6993,"author_seq":247,"given_name":1154,"surname":1155,"affiliation":63,"orcid":63},{"paper_id":6993,"author_seq":232,"given_name":7002,"surname":7003,"affiliation":63,"orcid":63},"Irina","Temnikova",{"paper_id":6993,"author_seq":218,"given_name":7005,"surname":7006,"affiliation":63,"orcid":63},"Samy","Hedaya",{"paper_id":6993,"author_seq":203,"given_name":7008,"surname":4211,"affiliation":63,"orcid":63},"Stephan","This article presents the WAW Corpus, an interpreting corpus for English\u002FArabic, which can be used for teaching interpreters, studying the characteristics of interpreters’ work, as well as to train machine translation systems. 
The corpus contains recordings of lectures and speeches from international conferences, their interpretations, the transcripts of the original speeches and of their interpretations, as well as human translations of both kinds of transcripts into the opposite language of the language pair. The article presents the corpus curation, statistics, assessment, as well as a case study of the corpus use.",{"paper_id":7011,"title":7012,"year":81,"month":855,"day":63,"doi":7013,"resource_url":7014,"first_page":63,"last_page":63,"pdf_url":7015,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7016,"paper_type":860,"authors":7017,"abstract":7021},"lrec2018-main-337","Polish Corpus of Annotated Descriptions of Images","10.63317\u002F39sjov6tnwtn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-337","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F644.pdf","wroblewska-2018-polish",[7018],{"paper_id":7011,"author_seq":247,"given_name":7019,"surname":7020,"affiliation":63,"orcid":63},"Alina","Wróblewska","The paper presents a new dataset of image descriptions in Polish. The descriptions are morphosyntactically analysed and the pairs of these descriptions are annotated in terms of semantic relatedness and entailment. All annotations are provided by human annotators with strong linguistic background. The dataset can be used for evaluation of various systems integrating language and vision. It is applicable for evaluation of systems designed to image generation based on provided descriptions (text-to-image generation) or to caption generation based on images (image-to-text generation). 
Furthermore, as selected images are split into thematic groups, the dataset is also useful for validating image classification approaches.",{"paper_id":7023,"title":7024,"year":81,"month":855,"day":63,"doi":7025,"resource_url":7026,"first_page":63,"last_page":63,"pdf_url":7027,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7028,"paper_type":860,"authors":7029,"abstract":7042},"lrec2018-main-338","Action Verb Corpus","10.63317\u002F3w6k235xazdb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-338","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F693.pdf","gross-etal-2018-action",[7030,7032,7034,7037,7040],{"paper_id":7023,"author_seq":247,"given_name":1205,"surname":7031,"affiliation":63,"orcid":63},"Gross",{"paper_id":7023,"author_seq":232,"given_name":1244,"surname":7033,"affiliation":63,"orcid":63},"Hirschmanner",{"paper_id":7023,"author_seq":218,"given_name":7035,"surname":7036,"affiliation":63,"orcid":63},"Brigitte","Krenn",{"paper_id":7023,"author_seq":203,"given_name":7038,"surname":7039,"affiliation":63,"orcid":63},"Friedrich","Neubarth",{"paper_id":7023,"author_seq":188,"given_name":1780,"surname":7041,"affiliation":63,"orcid":63},"Zillich","The Action Verb Corpus comprises multimodal data of 12 humans conducting in total 390 simple actions (take, put, and push). Recorded are audio, video and motion data while participants perform an action and describe what they do. The dataset is annotated with the following information: orthographic transcriptions of utterances, part-of-speech tags, lemmata, information which object is currently moved, information whether a hand touches an object, information whether an object touches the ground\u002Ftable. Transcription, and information whether an object is in contact with a hand and which object moves where to were manually annotated, the rest was automatically annotated and manually corrected. 
In addition to the dataset, we present an algorithm for the challenging task of segmenting the stream of words into utterances, segmenting the visual input into a series of actions, and then aligning visual action information and speech. This kind of modality rich data is particularly important for crossmodal and cross-situational word-object and word-action learning in human-robot interactions, and is comparable to parent-toddler communication in early stages of child language acquisition.",{"paper_id":7044,"title":7045,"year":81,"month":855,"day":63,"doi":7046,"resource_url":7047,"first_page":63,"last_page":63,"pdf_url":7048,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7049,"paper_type":860,"authors":7050,"abstract":7062},"lrec2018-main-339","EMO&LY (EMOtion and AnomaLY) : A new corpus for anomaly detection in an audiovisual stream with emotional context.","10.63317\u002F2tm9f7km89yx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-339","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F714.pdf","fayet-etal-2018-emo",[7051,7054,7057,7060],{"paper_id":7044,"author_seq":247,"given_name":7052,"surname":7053,"affiliation":63,"orcid":63},"Cédric","Fayet",{"paper_id":7044,"author_seq":232,"given_name":7055,"surname":7056,"affiliation":63,"orcid":63},"Arnaud","Delhay",{"paper_id":7044,"author_seq":218,"given_name":7058,"surname":7059,"affiliation":63,"orcid":63},"Damien","Lolive",{"paper_id":7044,"author_seq":203,"given_name":7061,"surname":5884,"affiliation":63,"orcid":63},"Pierre-François","This paper presents a new corpus, called EMOLY (EMOtion and AnomaLY), composed of speech and facial video records of subjects that contains controlled anomalies.  As far as we know, to study the problem of anomaly detection in discourse by using machine learning classification techniques, no such corpus exists or is available to the community. 
In EMOLY, each subject is recorded three times in a recording studio, by filming his\u002Fher face and recording his\u002Fher voice with a HiFi microphone. Anomalies in discourse are induced or acted. At this time, about 8,65 hours of usable audiovisual recording on which we have tested classical classification techniques (GMM or One Class-SVM plus threshold classifier) are available. Results confirm the usability of the anomaly induction mechanism to produce anomalies in discourse and also the usability of the corpus to improve detection techniques.",{"paper_id":7064,"title":7065,"year":81,"month":855,"day":63,"doi":7066,"resource_url":7067,"first_page":63,"last_page":63,"pdf_url":7068,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7069,"paper_type":860,"authors":7070,"abstract":7076},"lrec2018-main-340","Development of an Annotated Multimodal Dataset for the Investigation of Classification and Summarisation of Presentations using High-Level Paralinguistic Features","10.63317\u002F2ezver24f9b9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-340","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F800.pdf","curtis-etal-2018-development",[7071,7073,7074],{"paper_id":7064,"author_seq":247,"given_name":4661,"surname":7072,"affiliation":63,"orcid":63},"Curtis",{"paper_id":7064,"author_seq":232,"given_name":3201,"surname":3202,"affiliation":63,"orcid":63},{"paper_id":7064,"author_seq":218,"given_name":1726,"surname":7075,"affiliation":63,"orcid":63},"Jones","Expanding online archives of presentation recordings provide potentially valuable resources for learning and research. However, the huge volume of data that is becoming available means that users have difficulty locating material which will be of most value to them. 
Conventional summarisation methods making use of text-based features derived from transcripts of spoken material can provide mechanisms to rapidly locate topically interesting material by reducing the amount of material that must be auditioned. However, these text-based methods take no account of the multimodal high-level paralinguistic features which form part of an audio-visual presentation, and can provide valuable indicators of the most interesting material within a presentation. We describe the development of a multimodal video dataset, recorded at an international conference, designed to support the exploration of automatic extraction of paralinguistic features and summarisation based on these features. The dataset is comprised of parallel recordings of the presenter and the audience for 31 conference presentations. We describe the process of performing manual annotation of high-level paralinguistic features for speaker ratings, audience engagement, speaker emphasis, and audience comprehension of these recordings. Used in combination these annotations enable research into the automatic classification of high-level paralinguistic features and their use in video summarisation.",{"paper_id":7078,"title":7079,"year":81,"month":855,"day":63,"doi":7080,"resource_url":7081,"first_page":63,"last_page":63,"pdf_url":7082,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7083,"paper_type":860,"authors":7084,"abstract":7087},"lrec2018-main-341","BKTreebank: Building a Vietnamese Dependency Treebank","10.63317\u002F2n9qmmemi7ic","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-341","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F69.pdf","nguyen-2018-bktreebank",[7085],{"paper_id":7078,"author_seq":247,"given_name":7086,"surname":1724,"affiliation":63,"orcid":63},"Kiem-Hieu","Dependency treebank is an important resource in any language. 
In this paper, we present our work on building BKTreebank, a dependency treebank for Vietnamese. Important points on designing POS tagset, dependency relations, and annotation guidelines are discussed. We describe experiments on POS tagging and dependency parsing on the treebank. Experimental results show that the treebank is a useful resource for Vietnamese language processing.",{"paper_id":7089,"title":7090,"year":81,"month":855,"day":63,"doi":7091,"resource_url":7092,"first_page":63,"last_page":63,"pdf_url":7093,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7094,"paper_type":860,"authors":7095,"abstract":7099},"lrec2018-main-342","GeCoTagger: Annotation of German Verb Complements with Conditional Random Fields","10.63317\u002F52cq38qd8cb7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-342","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F73.pdf","schneider-furbacher-2018-gecotagger",[7096,7097],{"paper_id":7089,"author_seq":247,"given_name":2083,"surname":5331,"affiliation":63,"orcid":63},{"paper_id":7089,"author_seq":232,"given_name":2653,"surname":7098,"affiliation":63,"orcid":63},"Fürbacher","Complement phrases are essential for constructing well-formed sentences in German. Identifying verb complements and categorizing complement classes is challenging even for linguists who are specialized in the field of verb valency. Against this background, we introduce an ML-based algorithm which is able to identify and classify complement phrases of any German verb in any written sentence context. 
We use a large training set consisting of example sentences from a valency dictionary, enriched with POS tagging, and the ML-based technique of Conditional Random Fields (CRF) to generate the classification models.",{"paper_id":7101,"title":7102,"year":81,"month":855,"day":63,"doi":7103,"resource_url":7104,"first_page":63,"last_page":63,"pdf_url":7105,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7106,"paper_type":860,"authors":7107,"abstract":7118},"lrec2018-main-343","AET: Web-based Adjective Exploration Tool for German","10.63317\u002F55d9xemq3xbx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-343","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F194.pdf","bladier-etal-2018-aet",[7108,7111,7114,7115],{"paper_id":7101,"author_seq":247,"given_name":7109,"surname":7110,"affiliation":63,"orcid":63},"Tatiana","Bladier",{"paper_id":7101,"author_seq":232,"given_name":7112,"surname":7113,"affiliation":63,"orcid":63},"Esther","Seyffarth",{"paper_id":7101,"author_seq":218,"given_name":1049,"surname":1050,"affiliation":63,"orcid":63},{"paper_id":7101,"author_seq":203,"given_name":7116,"surname":7117,"affiliation":63,"orcid":63},"Wiebke","Petersen","We present a new web-based corpus query tool, the Adjective Exploration Tool (AET), which enables research on the modificational behavior of German adjectives and adverbs. The tool can also be transferred to other languages and modification phenomena. The underlying database is derived from a corpus of German print media texts, which we annotated with dependency parses and several morphological, lexical, and statistical properties of the tokens. We extracted pairs of adjectives and adverbs (modifiers) as well as the tokens modified by them (modifiees) from the corpus and stored them in a way that makes the modifier-modifiee pairs easily searchable. 
With AET, linguists from different research areas can access corpus samples using an intuitive query language and user-friendly web interface. AET has been developed as a part of a collaborative research project that focuses on the compositional interaction of attributive adjectives with nouns and the interplay of events and adverbial modifiers. The tool is easy to extend and update and is free to use online without registration: http:\u002F\u002Faet.phil.hhu.de",{"paper_id":7120,"title":7121,"year":81,"month":855,"day":63,"doi":7122,"resource_url":7123,"first_page":63,"last_page":63,"pdf_url":7124,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7125,"paper_type":860,"authors":7126,"abstract":7129},"lrec2018-main-344","ZAP: An Open-Source Multilingual Annotation Projection Framework","10.63317\u002F2f5psdsq3zn2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-344","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F301.pdf","akbik-vollgraf-2018-zap",[7127,7128],{"paper_id":7120,"author_seq":247,"given_name":2238,"surname":2239,"affiliation":63,"orcid":63},{"paper_id":7120,"author_seq":232,"given_name":2241,"surname":2242,"affiliation":63,"orcid":63},"Previous work leveraged annotation projection as a convenient method to automatically generate linguistic resources such as treebanks or propbanks for new languages. This approach automatically transfers linguistic annotation from a resource-rich source language (SL) to translations in a target language (TL). However, to the best of our knowledge, no publicly available framework for this approach currently exists, limiting researchers’ ability to reproduce and compare experiments. In this paper, we present ZAP, the first open-source framework for annotation projection in parallel corpora. 
Our framework is Java-based and includes methods for preprocessing corpora, computing word-alignments between sentence pairs, transferring different layers of linguistic annotation, and visualization. The framework was designed for ease-of-use with lightweight APIs. We give an overview of ZAP and illustrate its usage.",{"paper_id":7131,"title":7132,"year":81,"month":855,"day":63,"doi":7133,"resource_url":7134,"first_page":63,"last_page":63,"pdf_url":7135,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7136,"paper_type":860,"authors":7137,"abstract":7144},"lrec2018-main-345","Palmyra: A Platform Independent Dependency Annotation Tool for Morphologically Rich Languages","10.63317\u002F4rd5z89grbxd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-345","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F333.pdf","javed-etal-2018-palmyra",[7138,7141,7142],{"paper_id":7131,"author_seq":247,"given_name":7139,"surname":7140,"affiliation":63,"orcid":63},"Talha","Javed",{"paper_id":7131,"author_seq":232,"given_name":3646,"surname":3647,"affiliation":63,"orcid":63},{"paper_id":7131,"author_seq":218,"given_name":6112,"surname":7143,"affiliation":63,"orcid":63},"Taji","We present Palmyra, a platform independent graphical dependency tree visualization and editing software. Palmyra has been specifically designed to support the complexities of syntactic annotation of morphologically rich languages, especially regarding easy change of morphological tokenization through edits, additions, deletions, splits and merges of words. 
Palmyra uses an intuitive drag-and-drop interface for editing tree structures, and provides pop-up boxes and keyboard shortcuts for part-of-speech and link label tagging.",{"paper_id":7146,"title":7147,"year":81,"month":855,"day":63,"doi":7148,"resource_url":7149,"first_page":63,"last_page":63,"pdf_url":7150,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7151,"paper_type":860,"authors":7152,"abstract":7158},"lrec2018-main-346","A Web-based System for Crowd-in-the-Loop Dependency Treebanking","10.63317\u002F3d9jdgb5c4ab","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-346","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F339.pdf","tratz-phan-2018-web",[7153,7155],{"paper_id":7146,"author_seq":247,"given_name":2586,"surname":7154,"affiliation":63,"orcid":63},"Tratz",{"paper_id":7146,"author_seq":232,"given_name":7156,"surname":7157,"affiliation":63,"orcid":63},"Nhien","Phan","Treebanks exist for many different languages, but they are often quite limited in terms of size, genre, and topic coverage. It is difficult to expand these treebanks or to develop new ones in part because manual annotation is time-consuming and expensive. Human-in-the-loop methods that leverage machine learning algorithms during the annotation process are one set of techniques that could be employed to accelerate annotation of large numbers of sentences. Additionally, crowdsourcing could be used to hire a large number of annotators at relatively low cost. Currently, there are few treebanking tools available that support either human-in-the-loop methods or crowdsourcing. To address this, we introduce CrowdTree , a web-based interactive tool for editing dependency trees. In addition to the visual frontend, the system has a Java servlet that can train a parsing model during the annotation process. 
This parsing model can then be applied to sentences as they are requested by annotators so that, instead of annotating sentences from scratch, annotators need only to edit the model’s predictions, potentially resulting in significant time savings. Multiple annotators can work simultaneously, and the system is even designed to be compatible with Mechanical Turk. Thus, CrowdTree supports not simply human-in-the-loop treebanking, but crowd-in-the-loop treebanking.",{"paper_id":7160,"title":7161,"year":81,"month":855,"day":63,"doi":7162,"resource_url":7163,"first_page":63,"last_page":63,"pdf_url":7164,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7165,"paper_type":860,"authors":7166,"abstract":7176},"lrec2018-main-347","Building Universal Dependency Treebanks in Korean","10.63317\u002F3nae5kzry287","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-347","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F378.pdf","chun-etal-2018-building",[7167,7170,7172,7174],{"paper_id":7160,"author_seq":247,"given_name":7168,"surname":7169,"affiliation":63,"orcid":63},"Jayeol","Chun",{"paper_id":7160,"author_seq":232,"given_name":7171,"surname":2403,"affiliation":63,"orcid":63},"Na-Rae",{"paper_id":7160,"author_seq":218,"given_name":7173,"surname":6679,"affiliation":63,"orcid":63},"Jena D.",{"paper_id":7160,"author_seq":203,"given_name":7175,"surname":1110,"affiliation":63,"orcid":63},"Jinho D.","This paper presents three treebanks in Korean that consist of dependency trees derived from existing treebanks, the Google UD Treebank, the Penn Korean Treebank, and the Kaist Treebank, and pseudo-annotated by the latest guidelines from the Universal Dependencies (UD) project. The Korean portion of the Google UD Treebank is re-tokenized to match the morpheme-level annotation suggested by the other corpora, and systematically assessed for errors. 
Phrase structure trees in the Penn Korean Treebank and the Kaist Treebank are automatically converted into dependency trees using head-finding rules and linguistic heuristics. Additionally, part-of-speech tags in all treebanks are converted into the UD tagset. A total of 38K+ dependency trees are generated that comprise a coherent set of dependency relations for over a half million tokens. To the best of our knowledge, this is the first time that these Korean corpora are analyzed together and transformed into dependency trees following the latest UD guidelines, version 2.",{"paper_id":7178,"title":7179,"year":81,"month":855,"day":63,"doi":7180,"resource_url":7181,"first_page":63,"last_page":63,"pdf_url":7182,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7183,"paper_type":860,"authors":7184,"abstract":7192},"lrec2018-main-348","Moving TIGER beyond Sentence-Level","10.63317\u002F2pifc25rjmjz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-348","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F440.pdf","falenska-etal-2018-moving",[7185,7188,7191],{"paper_id":7178,"author_seq":247,"given_name":7186,"surname":7187,"affiliation":63,"orcid":63},"Agnieszka","Falenska",{"paper_id":7178,"author_seq":232,"given_name":7189,"surname":7190,"affiliation":63,"orcid":63},"Kerstin","Eckart",{"paper_id":7178,"author_seq":218,"given_name":1278,"surname":4138,"affiliation":63,"orcid":63},"We present TIGER 2.2-doc -- a new set of annotations for the German TIGER corpus. The set moves the corpus to a document level. It includes a full mapping of sentences to documents, as well as additional sentence-level and document-level annotations. The sentence-level annotations refer to the role of a sentence in the document. They introduce structure to the TIGER documents by separating headers and meta-level information from article content. 
Document-level annotations recover information which has been neglected in the intermediate releases of the TIGER corpus, such as document categories and publication dates of the articles. Additionally, we introduce new document-level annotations: authors and their gender. We describe the process of corpus annotation, show statistics of the obtained data and present baseline experiments for lemmatization, part-of-speech and morphological tagging, and dependency parsing. Finally, we present two example use cases: sentence boundary detection and authorship attribution. These use cases take the data from TIGER into account and illustrate the usefulness of the new annotation layers from TIGER 2.2-doc.",{"paper_id":7194,"title":7195,"year":81,"month":855,"day":63,"doi":7196,"resource_url":7197,"first_page":63,"last_page":63,"pdf_url":7198,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7199,"paper_type":860,"authors":7200,"abstract":7207},"lrec2018-main-349","Spanish HPSG Treebank based on the AnCora Corpus","10.63317\u002F32zxkn44c5y9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-349","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F543.pdf","chiruzzo-wonsever-2018-spanish",[7201,7204],{"paper_id":7194,"author_seq":247,"given_name":7202,"surname":7203,"affiliation":63,"orcid":63},"Luis","Chiruzzo",{"paper_id":7194,"author_seq":232,"given_name":7205,"surname":7206,"affiliation":63,"orcid":63},"Dina","Wonsever","This paper describes a corpus of HPSG annotated trees for Spanish that contains morphosyntactic information, annotations for semantic roles, clitic pronouns and relative clauses. The corpus is based on the Spanish AnCora corpus, which contains trees for 17,000 sentences comprising half a million words, and it has CFG style annotations. 
The corpus is stored in two different formats: An XML dialect that is the direct serialization of the typed feature structure trees, and an HTML format that is suitable for visualizing the trees in a browser.",{"paper_id":7209,"title":7210,"year":81,"month":855,"day":63,"doi":7211,"resource_url":7212,"first_page":63,"last_page":63,"pdf_url":7213,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7214,"paper_type":860,"authors":7215,"abstract":7223},"lrec2018-main-350","Universal Dependencies for Amharic","10.63317\u002F59idak9jiu75","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-350","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F565.pdf","seyoum-etal-2018-universal",[7216,7219,7220],{"paper_id":7209,"author_seq":247,"given_name":7217,"surname":7218,"affiliation":63,"orcid":63},"Binyam Ephrem","Seyoum",{"paper_id":7209,"author_seq":232,"given_name":6148,"surname":6149,"affiliation":63,"orcid":63},{"paper_id":7209,"author_seq":218,"given_name":7221,"surname":7222,"affiliation":63,"orcid":63},"Baye Yimam","Mekonnen","In this paper, we describe the process of creating an Amharic Dependency Treebank, which is the first attempt to introduce Universal Dependencies (UD) into Amharic. Amharic is a morphologically-rich and less-resourced language within the Semitic language family. In Amharic, an orthographic word may be bundled with information other than morphology. There are some clitics attached to major lexical categories with grammatical functions. We first explain the segmentation of clitics, which is problematic to retrieve from the orthographic word due to morpheme co-occurrence restriction, assimilation and ambiguity of the clitics. Then, we describe the annotation processes for POS tagging, morphological information and dependency relations. 
Based on this, we have created a Treebank of 1,096 sentences.",{"paper_id":7225,"title":7226,"year":81,"month":855,"day":63,"doi":7227,"resource_url":7228,"first_page":63,"last_page":63,"pdf_url":7229,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7230,"paper_type":860,"authors":7231,"abstract":7236},"lrec2018-main-351","A Parser for LTAG and Frame Semantics","10.63317\u002F3tbg8mumqv82","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-351","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F567.pdf","arps-petitjean-2018-parser",[7232,7234],{"paper_id":7225,"author_seq":247,"given_name":1199,"surname":7233,"affiliation":63,"orcid":63},"Arps",{"paper_id":7225,"author_seq":232,"given_name":1269,"surname":7235,"affiliation":63,"orcid":63},"Petitjean","Since the idea of combining Lexicalized Tree Adjoining Grammars (LTAG) and frame semantics was proposed (Kallmeyer and Osswald, 2013), a set of resources of this type has been created. These grammars are composed of pairs of elementary trees and frames, where syntactic and semantic arguments are linked using unification variables. This allows to build semantic representations when parsing, by composing the frames according to the combination of the elementary trees. However, the lack of a parser using such grammars makes it complicated to check whether these resources are correct implementations of the theory or not. The development of larger resources, that is to say large-coverage grammars, is also conditioned by the existence of such a parser. In this paper, we present our solution to this problem, namely an extension of the TuLiPA parser with frame semantics. 
We also present the frameworks used to build the resources used by the parser: the theoretical framework, composed of LTAG and frame semantics, and the software framework, XMG2.",{"paper_id":7238,"title":7239,"year":81,"month":855,"day":63,"doi":7240,"resource_url":7241,"first_page":63,"last_page":63,"pdf_url":7242,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7243,"paper_type":860,"authors":7244,"abstract":7251},"lrec2018-main-352","Multilingual Dependency Parsing for Low-Resource Languages: Case Studies on North Saami and Komi-Zyrian","10.63317\u002F2j7vvhwp5fwc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-352","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F600.pdf","lim-etal-2018-multilingual",[7245,7247,7249],{"paper_id":7238,"author_seq":247,"given_name":7246,"surname":6830,"affiliation":63,"orcid":63},"KyungTae",{"paper_id":7238,"author_seq":232,"given_name":2504,"surname":7248,"affiliation":63,"orcid":63},"Partanen",{"paper_id":7238,"author_seq":218,"given_name":881,"surname":7250,"affiliation":63,"orcid":63},"Poibeau","The paper presents a method for parsing low-resource languages with very small training corpora using multilingual word embeddings and annotated corpora of larger languages. The study demonstrates that specific language combinations enable improved dependency parsing when compared to previous work, allowing for wider reuse of pre-existing resources when parsing low-resource languages. 
The study also explores the question of whether contemporary contact languages or genetically related languages would be the most fruitful starting point for multilingual parsing scenarios.",{"paper_id":7253,"title":7254,"year":81,"month":855,"day":63,"doi":7255,"resource_url":7256,"first_page":63,"last_page":63,"pdf_url":7257,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7258,"paper_type":860,"authors":7259,"abstract":7264},"lrec2018-main-353","FonBund: A Library for Combining Cross-lingual Phonological Segment Data","10.63317\u002F448aqn8uvphj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-353","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8889.pdf","gutkin-etal-2018-fonbund",[7260,7261,7262],{"paper_id":7253,"author_seq":247,"given_name":2736,"surname":5582,"affiliation":63,"orcid":63},{"paper_id":7253,"author_seq":232,"given_name":1248,"surname":5584,"affiliation":63,"orcid":63},{"paper_id":7253,"author_seq":218,"given_name":7109,"surname":7263,"affiliation":63,"orcid":63},"Merkulova","We present an open-source library (FonBund) that provides a way of mapping sequences of arbitrary phonetic segments in International Phonetic Alphabet (IPA) into multiple articulatory feature representations. The library interfaces with several existing linguistic typology resources providing phonological segment inventories and their corresponding articulatory feature systems. Our first goal was to facilitate the derivation of articulatory features without giving a special preference to any particular phonological segment inventory provided by freely available linguistic typology resources. The second goal was to build a very light-weight library that can be easily modified to support new phonological segment inventories. 
In order to support IPA segments that do not occur in the freely available resources, the library provides a simple configuration language for performing segment rewrites and adding custom segments with the corresponding feature structures. In addition to introducing the library and the corresponding linguistic resources, we also describe some of the practical uses of this library (multilingual speech synthesis) in the hope that this software will help facilitate multilingual speech research.",{"paper_id":7266,"title":7267,"year":81,"month":855,"day":63,"doi":7268,"resource_url":7269,"first_page":63,"last_page":63,"pdf_url":7270,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7271,"paper_type":860,"authors":7272,"abstract":7281},"lrec2018-main-354","Voice Builder: A Tool for Building Text-To-Speech Voices","10.63317\u002F3b2pkzfxczhi","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-354","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8887.pdf","de-silva-etal-2018-voice",[7273,7275,7278,7280],{"paper_id":7266,"author_seq":247,"given_name":5685,"surname":7274,"affiliation":63,"orcid":63},"De Silva",{"paper_id":7266,"author_seq":232,"given_name":7276,"surname":7277,"affiliation":63,"orcid":63},"Theeraphol","Wattanavekin",{"paper_id":7266,"author_seq":218,"given_name":7279,"surname":2717,"affiliation":63,"orcid":63},"Tang",{"paper_id":7266,"author_seq":203,"given_name":5573,"surname":5574,"affiliation":63,"orcid":63},"We describe an opensource text-to-speech (TTS) voice building tool that focuses on simplicity, flexibility, and collaboration. Our tool allows anyone with basic computer skills to run voice training experiments and listen to the resulting synthesized voice. We hope that this tool will reduce the barrier for creating new voices and accelerate TTS research, by making experimentation faster and interdisciplinary collaboration easier. 
We believe that our tool can help improve TTS research, especially for low-resourced languages, where more experimentations are often needed to get the most out of the limited language resources.",{"paper_id":7283,"title":7284,"year":81,"month":855,"day":63,"doi":7285,"resource_url":7286,"first_page":63,"last_page":63,"pdf_url":7287,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7288,"paper_type":860,"authors":7289,"abstract":7305},"lrec2018-main-355","Sudachi: a Japanese Tokenizer for Business","10.63317\u002F2hsnuu4nxsnq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-355","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8884.pdf","takaoka-etal-2018-sudachi",[7290,7293,7296,7298,7301,7304],{"paper_id":7283,"author_seq":247,"given_name":7291,"surname":7292,"affiliation":63,"orcid":63},"Kazuma","Takaoka",{"paper_id":7283,"author_seq":232,"given_name":7294,"surname":7295,"affiliation":63,"orcid":63},"Sorami","Hisamoto",{"paper_id":7283,"author_seq":218,"given_name":7297,"surname":1880,"affiliation":63,"orcid":63},"Noriko",{"paper_id":7283,"author_seq":203,"given_name":7299,"surname":7300,"affiliation":63,"orcid":63},"Miho","Sakamoto",{"paper_id":7283,"author_seq":188,"given_name":7302,"surname":7303,"affiliation":63,"orcid":63},"Yoshitaka","Uchida",{"paper_id":7283,"author_seq":172,"given_name":3649,"surname":2464,"affiliation":63,"orcid":63},"This paper presents Sudachi, a Japanese tokenizer and its accompanying language resources for business use. Tokenization, or morphological analysis, is a fundamental and important technology for processing a Japanese text, especially for industrial applications. However, we often face many obstacles for Japanese tokenization, such as the inconsistency of token unit in different resources, notation variations, discontinued maintenance of the resources, and various issues with the existing tokenizer implementations. 
In order to improve this situation, we develop a new tokenizer and a dictionary with features such as multi-granular output for different purposes and normalization of notation variations. In addition to this, we are planning to continuously maintain our software and resource in long-term as a part of the company business. We release the resulting tokenizer software and language resources freely available to the public as an open source software. You can access them at https:\u002F\u002Fgithub.com\u002FWorksApplications\u002FSudachi.",{"paper_id":7307,"title":7308,"year":81,"month":855,"day":63,"doi":7309,"resource_url":7310,"first_page":63,"last_page":63,"pdf_url":7311,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7312,"paper_type":860,"authors":7313,"abstract":7325},"lrec2018-main-356","Chemical Compounds Knowledge Visualization with Natural Language Processing and Linked Data","10.63317\u002F49ypebonuivx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-356","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F8886.pdf","tanaka-etal-2018-chemical",[7314,7316,7319,7321,7323,7324],{"paper_id":7307,"author_seq":247,"given_name":7315,"surname":6146,"affiliation":63,"orcid":63},"Kazunari",{"paper_id":7307,"author_seq":232,"given_name":7317,"surname":7318,"affiliation":63,"orcid":63},"Tomoya","Iwakura",{"paper_id":7307,"author_seq":218,"given_name":6148,"surname":7320,"affiliation":63,"orcid":63},"Koyanagi",{"paper_id":7307,"author_seq":203,"given_name":7297,"surname":7322,"affiliation":63,"orcid":63},"Ikeda",{"paper_id":7307,"author_seq":188,"given_name":3651,"surname":4122,"affiliation":63,"orcid":63},{"paper_id":7307,"author_seq":172,"given_name":3649,"surname":2464,"affiliation":63,"orcid":63},"Knowledge of chemical compounds is invaluable for developing new materials, new drugs, and so on. Therefore, databases of chemical compounds are being created. 
For example, CAS, one of the largest databases, includes over 100 million chemical compound information. However, the creation of such databases strongly depends on manual labor since chemical compounds are being produced at every moment. In addition, the database creation mainly focuses on English text. Therefore, in other words, chemical compound information other than English is not good enough to be available.  For example, although Japan has one of the largest chemical industries and has large chemical compound information written in Japanese text documents, such information is not exploited well so far.  We propose a visualization system based on chemical compound extraction results with Japanese Natural Language Processing and structured databases represented as Linked Data (LD).  Figure 1 shows an overview of our system. First, chemical compound names in text are recognized. Then, aliases of chemical compound names are identified. The extraction results and existing chemical compound databases are represented as LD. 
By combining these LD-based chemical compound knowledge, our system provides different views of chemical compounds.",{"paper_id":7327,"title":7328,"year":81,"month":855,"day":63,"doi":7329,"resource_url":7330,"first_page":63,"last_page":63,"pdf_url":7331,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7332,"paper_type":860,"authors":7333,"abstract":7340},"lrec2018-main-357","Using Discourse Information for Education with a Spanish-Chinese Parallel Corpus","10.63317\u002F2f56eszvui9x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-357","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F57.pdf","cao-gete-2018-using",[7334,7337],{"paper_id":7327,"author_seq":247,"given_name":7335,"surname":7336,"affiliation":63,"orcid":63},"Shuyuan","Cao",{"paper_id":7327,"author_seq":232,"given_name":7338,"surname":7339,"affiliation":63,"orcid":63},"Harritxu","Gete","Nowadays, with the fruitful achievements in Natural Language Processing (NLP) studies, the concern of using NLP technologies for education has called much attention. As two of the most spoken languages in the world, Spanish and Chinese occupy important positions in both NLP studies and bilingual education. In this paper, we present a Spanish-Chinese parallel corpus with annotated discourse information that aims to serve for bilingual language education. The theoretical framework of this work is Rhetorical Structure Theory (RST). The corpus is composed of 100 Spanish-Chinese parallel texts, and all the discourse markers (DM) have been annotated to form the education source. With pedagogical aim, we also present two programs that generate automatic exercises for both Spanish and Chinese students using our corpus. 
The reliability of this work has been evaluated using Kappa coefficient.",{"paper_id":7342,"title":7343,"year":81,"month":855,"day":63,"doi":7344,"resource_url":7345,"first_page":63,"last_page":63,"pdf_url":7346,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7347,"paper_type":860,"authors":7348,"abstract":7352},"lrec2018-main-358","A 2nd Longitudinal Corpus for Children’s Writing with Enhanced Output for Specific Spelling Patterns","10.63317\u002F43odf8w8xmt9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-358","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F60.pdf","berkling-2018-2nd",[7349],{"paper_id":7342,"author_seq":247,"given_name":7350,"surname":7351,"affiliation":63,"orcid":63},"Kay","Berkling","This paper describes the collection of three longitudinal Corpora of German school children's weekly writing in German, called H2 (H1 is available via LDC and contains some of the same students' writing 2 years previously), E2 (E1 is not public), and ERK1. The texts were written within the normal classroom setting. Texts of children whose parents signed the permission to donate the texts to science were collected and transcribed. The corpus consists of the elicitation techniques, an overview of the data collected and the  transcriptions of the texts both with and without spelling errors, aligned on a word by word basis. In addition, the hand-written texts were scanned in. The corpus is available for research via Linguistic Data Consortium (LDC). 
When using this Corpus, researchers are strongly encouraged to make additional annotations and improvements and return it to the public domain via LDC, especially since this effort was unfunded.",{"paper_id":7354,"title":7355,"year":81,"month":855,"day":63,"doi":7356,"resource_url":7357,"first_page":63,"last_page":63,"pdf_url":7358,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7359,"paper_type":860,"authors":7360,"abstract":7370},"lrec2018-main-359","Development of a Mobile Observation Support System for Students: FishWatchr Mini","10.63317\u002F2ahyab3mdm3p","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-359","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F166.pdf","yamaguchi-etal-2018-development",[7361,7364,7367],{"paper_id":7354,"author_seq":247,"given_name":7362,"surname":7363,"affiliation":63,"orcid":63},"Masaya","Yamaguchi",{"paper_id":7354,"author_seq":232,"given_name":7365,"surname":7366,"affiliation":63,"orcid":63},"Masanori","Kitamura",{"paper_id":7354,"author_seq":218,"given_name":7368,"surname":7369,"affiliation":63,"orcid":63},"Naomi","Yanagida","This paper presents a system called FishWatchr Mini (FWM), which supports students in observing and reflecting on educational activities such as presentation practices.  Although video annotation systems are supposed to support such activities, especially reflection, they have been used by researchers and teachers outside the classroom, rather than by students inside the classroom. 
To resolve problems with introducing video annotation systems in the classroom, we propose three solutions: (a) to facilitate preparation of devices, FWM is implemented as a Web application on students' smartphones; (b) to facilitate students' learning to use the system, FWM is designed to automate as many operations as possible, apart from annotation; and (c) FWM allows students to examine annotation data through reflection, by providing functions such as visualization.  The results of applying FWM to presentation practices in a university validated the effectiveness of FWM in terms of its ease of use and applicability of annotation results. Since annotated video data could also provide language resources for teachers, it is anticipated that they will contribute to improving their classes in the future.",{"paper_id":7372,"title":7373,"year":81,"month":855,"day":63,"doi":7374,"resource_url":7375,"first_page":63,"last_page":63,"pdf_url":7376,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7377,"paper_type":860,"authors":7378,"abstract":7393},"lrec2018-main-360","The AnnCor CHILDES Treebank","10.63317\u002F3ffgfmpqcts5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-360","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F223.pdf","odijk-etal-2018-anncor",[7379,7381,7383,7386,7387,7390],{"paper_id":7372,"author_seq":247,"given_name":2633,"surname":7380,"affiliation":63,"orcid":63},"Odijk",{"paper_id":7372,"author_seq":232,"given_name":1127,"surname":7382,"affiliation":63,"orcid":63},"Dimitriadis",{"paper_id":7372,"author_seq":218,"given_name":7384,"surname":7385,"affiliation":63,"orcid":63},"Martijn","van der 
Klis",{"paper_id":7372,"author_seq":203,"given_name":4284,"surname":4285,"affiliation":63,"orcid":63},{"paper_id":7372,"author_seq":188,"given_name":7388,"surname":7389,"affiliation":63,"orcid":63},"Meie","Otten",{"paper_id":7372,"author_seq":172,"given_name":7391,"surname":7392,"affiliation":63,"orcid":63},"Remco","van der Veen","This paper (1) presents the first partially manually verified treebank for Dutch CHILDES corpora, the AnnCor CHILDES Treebank; (2) argues explicitly that it is useful to assign adult grammar syntactic structures to utterances of children  who are still in the process of acquiring the language; (3) argues that human annotation and automatic checks on this annotation must go hand  in hand; (4) argues that explicit annotation guidelines and conventions must be developed and adhered to and emphasises  consistency of the annotations as an important desirable property for annotations. It also describes  the tools used for annotation and automated checks on edited syntactic structures, as well as  extensions to  an existing treebank query application (GrETEL) and  the multiple formats in which the resources will be made available.",{"paper_id":7395,"title":7396,"year":81,"month":855,"day":63,"doi":7397,"resource_url":7398,"first_page":63,"last_page":63,"pdf_url":7399,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7400,"paper_type":860,"authors":7401,"abstract":7417},"lrec2018-main-361","BabyCloud, a Technological Platform for Parents and 
Researchers","10.63317\u002F4i4owwveu9x5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-361","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F316.pdf","cao-etal-2018-babycloud",[7402,7404,7407,7409,7412,7415],{"paper_id":7395,"author_seq":247,"given_name":7403,"surname":7336,"affiliation":63,"orcid":63},"Xuân-Nga",{"paper_id":7395,"author_seq":232,"given_name":7405,"surname":7406,"affiliation":63,"orcid":63},"Cyrille","Dakhlia",{"paper_id":7395,"author_seq":218,"given_name":6646,"surname":7408,"affiliation":63,"orcid":63},"Del Carmen",{"paper_id":7395,"author_seq":203,"given_name":7410,"surname":7411,"affiliation":63,"orcid":63},"Mohamed-Amine","Jaouani",{"paper_id":7395,"author_seq":188,"given_name":7413,"surname":7414,"affiliation":63,"orcid":63},"Malik","Ould-Arbi",{"paper_id":7395,"author_seq":172,"given_name":2211,"surname":7416,"affiliation":63,"orcid":63},"Dupoux","In this paper, we present BabyCloud, a platform for capturing, storing and analyzing daylong audio recordings and photographs of children's linguistic environments, for the purpose of studying infant's cognitive and linguistic development and interactions with the environment. The proposed platform connects two communities of users: families and academics, with strong innovation potential for each type of users. For families, the platform offers a novel functionality: the ability for parents to follow the development of their child on a daily basis through language and cognitive metrics (growth curves in number of words, verbal complexity, social skills, etc). For academic research, the platform provides a novel means for studying language and cognitive development at an unprecedented scale and level of detail. They will submit algorithms to the secure server which will only output anonymized aggregate statistics. Ultimately, BabyCloud aims at creating an ecosystem of third parties (public and private research labs...) 
gravitating around developmental data, entirely controlled by the party whose data originate from, i.e. families.",{"paper_id":7419,"title":7420,"year":81,"month":855,"day":63,"doi":7421,"resource_url":7422,"first_page":63,"last_page":63,"pdf_url":7423,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7424,"paper_type":860,"authors":7425,"abstract":7434},"lrec2018-main-362","Infant Word Comprehension-to-Production Index Applied to Investigation of Noun Learning Predominance Using Cross-lingual CDI database","10.63317\u002F5biecgwgp8np","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-362","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F383.pdf","minami-etal-2018-infant",[7426,7429,7431],{"paper_id":7419,"author_seq":247,"given_name":7427,"surname":7428,"affiliation":63,"orcid":63},"Yasuhiro","Minami",{"paper_id":7419,"author_seq":232,"given_name":7430,"surname":3802,"affiliation":63,"orcid":63},"Tessei",{"paper_id":7419,"author_seq":218,"given_name":7432,"surname":7433,"affiliation":63,"orcid":63},"Yuko","Okumura","This paper defines a measure called the comprehension-to-production (C2P) index to investigate whether nouns have predominance over verbs in children’s word learning that identifies a partial word learning period from comprehension to production. 
We applied the C2P index to noun predominance using cross-lingual child communicative development inventory databases and confirmed that it indicates noun predominance in word learning, suggesting that the process between a word’s comprehension and its production is a significant factor of predominance in noun learning by children.",{"paper_id":7436,"title":7437,"year":81,"month":855,"day":63,"doi":7438,"resource_url":7439,"first_page":63,"last_page":63,"pdf_url":7440,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7441,"paper_type":860,"authors":7442,"abstract":7449},"lrec2018-main-363","Building a TOCFL Learner Corpus for Chinese Grammatical Error Diagnosis","10.63317\u002F25h4dvn46edt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-363","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F533.pdf","lee-etal-2018-building",[7443,7445,7447],{"paper_id":7436,"author_seq":247,"given_name":7444,"surname":4032,"affiliation":63,"orcid":63},"Lung-Hao",{"paper_id":7436,"author_seq":232,"given_name":7446,"surname":4689,"affiliation":63,"orcid":63},"Yuen-Hsien",{"paper_id":7436,"author_seq":218,"given_name":7448,"surname":2069,"affiliation":63,"orcid":63},"Li-Ping","This study describes the construction of a TOCFL learner corpus and its usage for Chinese grammatical error diagnosis. We collected essays from the Test Of Chinese as a Foreign Language (TOCFL) and annotated grammatical errors using hierarchical tagging sets. Two kinds of error classifications were used simultaneously to tag grammatical errors. The first capital letter of each error tags denotes the coarse-grained surface differences, while the subsequent lowercase letters denote the fine-grained linguistic categories. A total of 33,835 grammatical errors in 2,837 essays and their corresponding corrections were manually annotated. 
We then used the Standard Generalized Markup Language to format learner texts and annotations along with learners’ accompanying metadata. Parts of the TOCFL learner corpus have been provided for shared tasks on Chinese grammatical error diagnosis. We also investigated systems participating in the shared tasks to better understand current achievements and challenges. The datasets are publicly available to facilitate further research. To our best knowledge, this is the first annotated learner corpus of traditional Chinese, and the entire learner corpus will be publicly released.",{"paper_id":7451,"title":7452,"year":81,"month":855,"day":63,"doi":7453,"resource_url":7454,"first_page":63,"last_page":63,"pdf_url":7455,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7456,"paper_type":860,"authors":7457,"abstract":7460},"lrec2018-main-364","MIAPARLE: Online training for the discrimination of stress contrasts","10.63317\u002F36dotxgsnijm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-364","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F558.pdf","goldman-schwab-2018-miaparle",[7458,7459],{"paper_id":7451,"author_seq":247,"given_name":5185,"surname":5186,"affiliation":63,"orcid":63},{"paper_id":7451,"author_seq":232,"given_name":6270,"surname":3985,"affiliation":63,"orcid":63},"MIAPARLE is a public web application that is designed to offer a range of CAPT (computer-aided pronunciation teaching) tools for L2 learners. Besides helping language learners to reduce their foreign-accentedness, the goal of the platform is to test these tools, gather feedback and improve them according to their educational impact. In this article, we describe one particular training tool that focuses on stress perception. This tool is particularly useful for speakers whose L1 is a fixed-stress language, such as French. These speakers have difficulties perceiving and discriminating stress contrasts. 
To help them with this so-called stress 'deafness', the methodology used in the training is based on successive questions in which a visual pattern is associated with the sound of a lexical item. After successively completing their pre-tests, training and post-tests, the participants are given their improvement score. The performance of the training is evaluated by comparing the learner’s results at the pre- and post-test stages. Various methodological parameters, such as the number of training items or the number of visual patterns are tested in parallel in order to quantify their teaching efficiency, and to optimise the overall teaching impact.",{"paper_id":7462,"title":7463,"year":81,"month":855,"day":63,"doi":7464,"resource_url":7465,"first_page":63,"last_page":63,"pdf_url":7466,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7467,"paper_type":860,"authors":7468,"abstract":7472},"lrec2018-main-365","ESCRITO - An NLP-Enhanced Educational Scoring Toolkit","10.63317\u002F2ic4g8apsiqr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-365","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F590.pdf","zesch-horbach-2018-escrito",[7469,7470],{"paper_id":7462,"author_seq":247,"given_name":5014,"surname":5015,"affiliation":63,"orcid":63},{"paper_id":7462,"author_seq":232,"given_name":2516,"surname":7471,"affiliation":63,"orcid":63},"Horbach","We propose Escrito, a toolkit for scoring student writings using NLP techniques that addresses two main user groups: teachers and NLP researchers. Teachers can use a high-level API in the teacher mode to assemble scoring pipelines easily.  NLP researchers can use the developer mode to access a low-level API, which not only makes available a number of pre-implemented components, but also allows the user to integrate their own readers, preprocessing components, or feature extractors. 
In this way, the toolkit provides a ready-made testbed for applying the latest developments from NLP areas like text similarity, paraphrase detection, textual entailment, and argument mining within the highly challenging task of educational scoring and feedback. At the same time, it allows teachers to apply cutting-edge technology in the classroom.",{"paper_id":7474,"title":7475,"year":81,"month":855,"day":63,"doi":7476,"resource_url":7477,"first_page":63,"last_page":63,"pdf_url":7478,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7479,"paper_type":860,"authors":7480,"abstract":7491},"lrec2018-main-366","A Leveled Reading Corpus of Modern Standard Arabic","10.63317\u002F2vxe57mwjr49","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-366","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F619.pdf","al-khalil-etal-2018-leveled",[7481,7484,7487,7488],{"paper_id":7474,"author_seq":247,"given_name":7482,"surname":7483,"affiliation":63,"orcid":63},"Muhamed","Al Khalil",{"paper_id":7474,"author_seq":232,"given_name":7485,"surname":7486,"affiliation":63,"orcid":63},"Hind","Saddiki",{"paper_id":7474,"author_seq":218,"given_name":3646,"surname":3647,"affiliation":63,"orcid":63},{"paper_id":7474,"author_seq":203,"given_name":7489,"surname":7490,"affiliation":63,"orcid":63},"Latifa","Alfalasi","We present a reading corpus in Modern Standard Arabic to enrich the sparse collection of resources that can be leveraged for educational applications. The corpus consists of textbook material from the curriculum of the United Arab Emirates, spanning all 12 grades (1.4 million tokens) and a collection of 129 unabridged works of fiction (5.6 million tokens) all annotated with reading levels from Grade 1 to Post-secondary. 
We examine reading progression in terms of lexical coverage, and compare the two sub-corpora (curricular, fiction) to others from clearly established genres (news, legal\u002Fdiplomatic) to measure representation of their respective genres.",{"paper_id":7493,"title":7494,"year":81,"month":855,"day":63,"doi":7495,"resource_url":7496,"first_page":63,"last_page":63,"pdf_url":7497,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7498,"paper_type":860,"authors":7499,"abstract":7509},"lrec2018-main-367","Developing New Linguistic Resources and Tools for the Galician Language","10.63317\u002F35btoyxg42fj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-367","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F848.pdf","agerri-etal-2018-developing",[7500,7502,7505,7506],{"paper_id":7493,"author_seq":247,"given_name":1953,"surname":7501,"affiliation":63,"orcid":63},"Agerri",{"paper_id":7493,"author_seq":232,"given_name":7503,"surname":7504,"affiliation":63,"orcid":63},"Xavier","Gómez Guinovart",{"paper_id":7493,"author_seq":218,"given_name":6777,"surname":6778,"affiliation":63,"orcid":63},{"paper_id":7493,"author_seq":203,"given_name":7507,"surname":7508,"affiliation":63,"orcid":63},"Miguel Anxo","Solla Portela","In this paper we describe the work towards developing new resources and Natural Language Processing (NLP) tools for the Galician language. First, a new corpus, manually revised, for POS tagging and lemmatization is described. Second, we present a new manually annotated corpus for Named Entity tagging for Galician. Third, we train and develop new NLP tools for Galician, including the first publicly available Galician statistical modules for lemmatization and Named Entity Recognition, and new modules for POS tagging, Wikification and Named Entity Disambiguation. 
Finally, we also present two new Web demo applications to easily test the new set of tools online.",{"paper_id":7511,"title":7512,"year":81,"month":855,"day":63,"doi":7513,"resource_url":7514,"first_page":63,"last_page":63,"pdf_url":7515,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7516,"paper_type":860,"authors":7517,"abstract":7531},"lrec2018-main-368","Modeling Northern Haida Verb Morphology","10.63317\u002F2whrzuzomqc3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-368","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F924.pdf","lachler-etal-2018-modeling",[7518,7521,7524,7527,7530],{"paper_id":7511,"author_seq":247,"given_name":7519,"surname":7520,"affiliation":63,"orcid":63},"Jordan","Lachler",{"paper_id":7511,"author_seq":232,"given_name":7522,"surname":7523,"affiliation":63,"orcid":63},"Lene","Antonsen",{"paper_id":7511,"author_seq":218,"given_name":7525,"surname":7526,"affiliation":63,"orcid":63},"Trond","Trosterud",{"paper_id":7511,"author_seq":203,"given_name":7528,"surname":7529,"affiliation":63,"orcid":63},"Sjur","Moshagen",{"paper_id":7511,"author_seq":188,"given_name":6296,"surname":6297,"affiliation":63,"orcid":63},"This paper describes the development of a computational model of the morphology of Northern Haida based on finite state machines (FSMs), with a focus on verbs. Northern Haida is highly endangered, and a member of the isolate Haida macrolanguage, spoken in British Columbia and Alaska. Northern Haida is a highly-inflecting language whose verbal morphology relies largely on suffixes, with a limited number of prefixes. The suffixes trigger morphophonological changes in the stem, participate in blocking, and exhibit variable ordering in certain constructions. The computational model of Northern Haida verb morphology is capable of handling these complex affixation patterns and the morphophonological alternations that they engender. 
In this paper, we describe the challenges we encountered and the solutions we propose, while contextualizing the endeavour in the description, documentation and revitalization of First Nations Languages in Canada.",{"paper_id":7533,"title":7534,"year":81,"month":855,"day":63,"doi":7535,"resource_url":7536,"first_page":63,"last_page":63,"pdf_url":7537,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7538,"paper_type":860,"authors":7539,"abstract":7550},"lrec2018-main-369","Low-resource Post Processing of Noisy OCR Output for Historical Corpus Digitisation","10.63317\u002F4cyi969wd46p","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-369","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F971.pdf","richter-etal-2018-low",[7540,7543,7545,7547],{"paper_id":7533,"author_seq":247,"given_name":7541,"surname":7542,"affiliation":63,"orcid":63},"Caitlin","Richter",{"paper_id":7533,"author_seq":232,"given_name":1226,"surname":7544,"affiliation":63,"orcid":63},"Wickes",{"paper_id":7533,"author_seq":218,"given_name":6422,"surname":7546,"affiliation":63,"orcid":63},"Beser",{"paper_id":7533,"author_seq":203,"given_name":7548,"surname":7549,"affiliation":63,"orcid":63},"Mitch","Marcus","We develop a post-processing system to efficiently correct errors from noisy optical character recognition (OCR) in a 2.7 million word Faroese corpus. 7.6% of the words in the original OCR text contain an error; fully manual correction would take thousands of hours due to the size of the corpus. Instead, our post-processing method applied to the Faroese corpus is projected to reduce the word error rate to 1.3% with around 65 hours of human annotator work. The foundation for generating corrected text is an HMM that learns patterns of OCR error and decodes noisy OCR character sequences into hypothesised correct language. 
A dictionary augments the HMM by contributing additional language knowledge, and a human annotator provides judgements in a small subset of cases that are identified as otherwise most prone to inaccurate output. An interactive workstation facilitates quick and accurate input for annotation. The entire toolkit is written in Python and is being made available for use in other low-resource languages where standard OCR technology falls short of desirable text quality. Supplementary analyses explore the impact of variable language resource availability and annotator time limitations on the end quality achievable with our toolkit.",{"paper_id":7552,"title":7553,"year":81,"month":855,"day":63,"doi":7554,"resource_url":7555,"first_page":63,"last_page":63,"pdf_url":7556,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7557,"paper_type":860,"authors":7558,"abstract":7575},"lrec2018-main-370","Introducing the CLARIN Knowledge Centre for Linguistic Diversity and Language Documentation","10.63317\u002F4du9mfu2sjhf","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-370","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F995.pdf","hedeland-etal-2018-introducing",[7559,7562,7565,7567,7570,7573],{"paper_id":7552,"author_seq":247,"given_name":7560,"surname":7561,"affiliation":63,"orcid":63},"Hanna","Hedeland",{"paper_id":7552,"author_seq":232,"given_name":7563,"surname":7564,"affiliation":63,"orcid":63},"Timm","Lehmberg",{"paper_id":7552,"author_seq":218,"given_name":1211,"surname":7566,"affiliation":63,"orcid":63},"Rau",{"paper_id":7552,"author_seq":203,"given_name":7568,"surname":7569,"affiliation":63,"orcid":63},"Sophie","Salffner",{"paper_id":7552,"author_seq":188,"given_name":7571,"surname":7572,"affiliation":63,"orcid":63},"Mandana","Seyfeddinipur",{"paper_id":7552,"author_seq":172,"given_name":3238,"surname":7574,"affiliation":63,"orcid":63},"Witt","The European digital research infrastructure CLARIN (Common 
Language Resources and Technology Infrastructure) is building a Knowledge Sharing Infrastructure (KSI) to ensure that existing knowledge and expertise is easily available both for the CLARIN community and for the humanities research communities for which CLARIN is being developed. Within the Knowledge Sharing Infrastructure, so called Knowledge Centres comprise one or more physical institutions with particular expertise in certain areas and are committed to providing their expertise in the form of reliable knowledge-sharing services. In this paper, we present the ninth K Centre – the CLARIN Knowledge Centre for Linguistic Diversity and Language Documentation (CKLD) – and the expertise and services provided by the member institutions at the Universities of London (ELAR\u002FSWLI), Cologne (DCH\u002FIfDH\u002FIfL) and Hamburg (HZSK\u002FINEL). The centre offers information on current best practices, available resources and tools, and gives advice on technological and methodological matters for researchers working within relevant fields.",{"paper_id":7577,"title":7578,"year":81,"month":855,"day":63,"doi":7579,"resource_url":7580,"first_page":63,"last_page":63,"pdf_url":7581,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7582,"paper_type":860,"authors":7583,"abstract":7589},"lrec2018-main-371","Low Resource Methods for Medieval Document Sections Analysis","10.63317\u002F38zvjuae4ihk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-371","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F999.pdf","galuscakova-neuzilova-2018-low",[7584,7587],{"paper_id":7577,"author_seq":247,"given_name":7585,"surname":7586,"affiliation":63,"orcid":63},"Petra","Galuščáková",{"paper_id":7577,"author_seq":232,"given_name":6912,"surname":7588,"affiliation":63,"orcid":63},"Neužilová","This paper describes a small but unique digitized collection of medieval Latin charters. 
This collection consists of 57 charters of 7 types illustrating various purposes of issuance by the Royal Chancellery. Sections in these documents were manually annotated for deeper analysis of the structure of issued charters. This paper also describes two baseline methods for an automatic and semi-automatic analysis and detection of sections of diplomatic documents. The first method is based on an information retrieval paradigm, and the second one is an adaptation of Hidden Markov Models. Both methods were proposed to work with respect to a small amount of available train data. Even though these methods were specifically proposed to work with medieval Latin charters, they can be applied to any documents with partially repetitive character.",{"paper_id":7591,"title":7592,"year":81,"month":855,"day":63,"doi":7593,"resource_url":7594,"first_page":63,"last_page":63,"pdf_url":7595,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7596,"paper_type":860,"authors":7597,"abstract":7611},"lrec2018-main-372","SB-CH: A Swiss German Corpus with Sentiment Annotations","10.63317\u002F59z46sa6988z","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-372","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1010.pdf","grubenmann-etal-2018-sb",[7598,7601,7604,7607,7609],{"paper_id":7591,"author_seq":247,"given_name":7599,"surname":7600,"affiliation":63,"orcid":63},"Ralf","Grubenmann",{"paper_id":7591,"author_seq":232,"given_name":7602,"surname":7603,"affiliation":63,"orcid":63},"Don","Tuggener",{"paper_id":7591,"author_seq":218,"given_name":7605,"surname":7606,"affiliation":63,"orcid":63},"Pius","von Däniken",{"paper_id":7591,"author_seq":203,"given_name":2633,"surname":7608,"affiliation":63,"orcid":63},"Deriu",{"paper_id":7591,"author_seq":188,"given_name":1364,"surname":7610,"affiliation":63,"orcid":63},"Cieliebak","We present the SB-CH corpus, a novel Swiss German corpus with annotations for sentiment analysis. 
It consists of more than 200,000 phrases (approx. 1 Mio tokens) from Facebook comments and online chats. Additionally, we provide sentiment annotations for almost 2000 Swiss German phrases. We describe the methodologies used in the collection and annotation of the data, and provide the first baseline results for Swiss German sentiment analysis.",{"paper_id":7613,"title":7614,"year":81,"month":855,"day":63,"doi":7615,"resource_url":7616,"first_page":63,"last_page":63,"pdf_url":7617,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7618,"paper_type":860,"authors":7619,"abstract":7625},"lrec2018-main-373","Universal Dependencies for Ainu","10.63317\u002F4m8c3pit8ewp","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-373","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1034.pdf","senuma-aizawa-2018-universal",[7620,7622],{"paper_id":7613,"author_seq":247,"given_name":3101,"surname":7621,"affiliation":63,"orcid":63},"Senuma",{"paper_id":7613,"author_seq":232,"given_name":7623,"surname":7624,"affiliation":63,"orcid":63},"Akiko","Aizawa","This paper reports an on-going effort to create a dependency tree bank for the Ainu language in the scheme of Universal Dependencies (UD). The task is crucial both language-internally (language revitalization) and language-externally (providing sources for new features and insights to UD). Since the language shows many of the representative phenomena of a type of languages called polysynthetic languages, an annotation schema to Ainu can be used as a basis to extend the current specification of UD. Our language resource comprises an annotation guideline, dependency bank based on UD, and a mini-lexicon. Although the size of the dependency bank will be small and contain only around 10,000 word tokens, it can serve as a base annotation for the next step. 
Our mini-lexicon is encoded under the W3C OntoLex specification with UD and UniMorph (UM) features with the system-friendly JSON-LD format and thus bearable to future extensions. We also provide a brief description of dependency relations and local features used in the bank such as pronominal cross-indexing and alienability.",{"paper_id":7627,"title":7628,"year":81,"month":855,"day":63,"doi":7629,"resource_url":7630,"first_page":63,"last_page":63,"pdf_url":7631,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7632,"paper_type":860,"authors":7633,"abstract":7653},"lrec2018-main-374","Signbank: Software to Support Web Based Dictionaries of Sign Language","10.63317\u002F39xy2jfsp733","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-374","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F499.pdf","cassidy-etal-2018-signbank",[7634,7635,7638,7641,7644,7647,7649,7650],{"paper_id":7627,"author_seq":247,"given_name":1737,"surname":1202,"affiliation":63,"orcid":63},{"paper_id":7627,"author_seq":232,"given_name":7636,"surname":7637,"affiliation":63,"orcid":63},"Onno","Crasborn",{"paper_id":7627,"author_seq":218,"given_name":7639,"surname":7640,"affiliation":63,"orcid":63},"Henri","Nieminen",{"paper_id":7627,"author_seq":203,"given_name":7642,"surname":7643,"affiliation":63,"orcid":63},"Wessel","Stoop",{"paper_id":7627,"author_seq":188,"given_name":7645,"surname":7646,"affiliation":63,"orcid":63},"Micha","Hulsbosch",{"paper_id":7627,"author_seq":172,"given_name":1232,"surname":7648,"affiliation":63,"orcid":63},"Even",{"paper_id":7627,"author_seq":155,"given_name":4617,"surname":4618,"affiliation":63,"orcid":63},{"paper_id":7627,"author_seq":138,"given_name":7651,"surname":7652,"affiliation":63,"orcid":63},"Trevor","Johnston","Signbank is a web application that was originally built to support the Auslan Signbank on-line web dictionary, it was an Open Source re-implementation of an earlier version of that site. 
The application provides a framework for the development of a rich lexical database of sign language augmented with video samples of signs. As an Open Source project, the original Signbank has formed the basis of a number of new sign language dictionaries and corpora including those for British Sign Language, Sign Language of the Netherlands and Finnish Sign Language. Versions are under development for American Sign Language and Flemish Sign Language. This paper describes the overall architecture of the Signbank system and its representation of lexical entries and associated entities.",{"paper_id":7655,"title":7656,"year":81,"month":855,"day":63,"doi":7657,"resource_url":7658,"first_page":63,"last_page":63,"pdf_url":7659,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7660,"paper_type":860,"authors":7661,"abstract":7678},"lrec2018-main-375","J-MeDic: A Japanese Disease Name Dictionary based on Real Clinical Usage","10.63317\u002F32oadgyeqrya","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-375","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F505.pdf","ito-etal-2018-j",[7662,7664,7666,7669,7672,7675],{"paper_id":7655,"author_seq":247,"given_name":7663,"surname":6971,"affiliation":63,"orcid":63},"Kaoru",{"paper_id":7655,"author_seq":232,"given_name":3651,"surname":7665,"affiliation":63,"orcid":63},"Nagai",{"paper_id":7655,"author_seq":218,"given_name":7667,"surname":7668,"affiliation":63,"orcid":63},"Taro","Okahisa",{"paper_id":7655,"author_seq":203,"given_name":7670,"surname":7671,"affiliation":63,"orcid":63},"Shoko","Wakamiya",{"paper_id":7655,"author_seq":188,"given_name":7673,"surname":7674,"affiliation":63,"orcid":63},"Tomohide","Iwao",{"paper_id":7655,"author_seq":172,"given_name":7676,"surname":7677,"affiliation":63,"orcid":63},"Eiji","Aramaki","Medical texts such as electronic health records are necessary for medical AI development. 
Nevertheless, it is difficult to use data directly because medical texts are written mostly in natural language, requiring natural language processing (NLP) for medical texts. To boost the fundamental accuracy of Medical NLP, a high coverage dictionary is required, especially one that fills the gap separating standard medical names and real clinical words. This study developed a Japanese disease name dictionary called “J-MeDic” to fill this gap. The names that comprise the dictionary were collected from approximately 45,000 manually annotated real clinical case reports. We allocated the standard disease code (ICD-10) to them with manual, semi-automatic, or automatic methods, in accordance with its frequency. The J-MeDic covers 7,683 concepts (in ICD-10) and 51,784 written forms. Among the names covered by J-MeDic, 55.3% (6,391\u002F11,562) were covered by SDNs; 44.7% (5,171\u002F11,562) were covered by names added from the CR corpus. Among them, 8.4% (436\u002F5,171) were basically coded by humans, and 91.6% (4,735\u002F5,171) were basically coded automatically. 
We investigated the coverage of this resource using discharge summaries from a hospital; 66.2% of the names are matched with the entries, revealing the practical feasibility of our dictionary.",{"paper_id":7680,"title":7681,"year":81,"month":855,"day":63,"doi":7682,"resource_url":7683,"first_page":63,"last_page":63,"pdf_url":7684,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7685,"paper_type":860,"authors":7686,"abstract":7692},"lrec2018-main-376","Building a List of Synonymous Words and Phrases of Japanese Compound Verbs","10.63317\u002F33vjvbktqdp7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-376","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F532.pdf","kanzaki-isahara-2018-building",[7687,7689],{"paper_id":7680,"author_seq":247,"given_name":1638,"surname":7688,"affiliation":63,"orcid":63},"Kanzaki",{"paper_id":7680,"author_seq":232,"given_name":7690,"surname":7691,"affiliation":63,"orcid":63},"Hitoshi","Isahara","We started to construct a database of synonymous expressions of Japanese “Verb + Verb” compounds semi-automatically. Japanese is known to be rich in compound verbs consisting of two verbs joined together. However, we did not have a comprehensive Japanese compound lexicon. Recently a Japanese compound verb lexicon was constructed by the National Institute for Japanese Language and Linguistics(NINJAL)(2013-15). Though it has meanings, example sentences, syntactic patterns and actual sentences from the corpus that they possess, it has no information on relationships with other words, such as synonymous words and phrases. We automatically extracted synonymous expressions of compound verbs from corpus which is “five hundred million Japanese texts gathered from the web” produced by Kawahara et al. (2006) by using word2vec and cosine similarity and find suitable clusters which correspond to meanings of the compound verbs by using k-means++ and PCA. 
The automatic extraction from corpus helps humans find not only typical synonyms but also unexpected synonymous words and phrases. Then we manually compile the list of synonymous expressions of Japanese compound verbs by assessing the result and also link it to the “Compound Verb Lexicon” published by NINJAL.",{"paper_id":7694,"title":7695,"year":81,"month":855,"day":63,"doi":7696,"resource_url":7697,"first_page":63,"last_page":63,"pdf_url":7698,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7699,"paper_type":860,"authors":7700,"abstract":7703},"lrec2018-main-377","Evaluating EcoLexiCAT: a Terminology-Enhanced CAT Tool","10.63317\u002F28ni8yez3sbz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-377","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F536.pdf","leon-arauz-reimerink-2018-evaluating",[7701,7702],{"paper_id":7694,"author_seq":247,"given_name":4053,"surname":4054,"affiliation":63,"orcid":63},{"paper_id":7694,"author_seq":232,"given_name":4050,"surname":4051,"affiliation":63,"orcid":63},"EcoLexiCAT is a web-based tool for the terminology-enhanced translation of specialized environmental texts for the language combination English-Spanish-English. It uses the open source version of the web-based CAT tool MateCat and enriches a source text with information from: (1) EcoLexicon, a multimodal and multilingual terminological knowledge base on the environment (Faber et al., 2014; Faber et al., 2016); (2) BabelNet, an automatically constructed multilingual encyclopedic dictionary and semantic network (Navigli & Ponzetto, 2012); (3) Sketch Engine, the well-known corpus query system (Kilgarriff et al., 2004); (4) IATE, the multilingual glossary of the European Commission; and (5) other external resources (i.e. Wikipedia, Collins, Wordreference, Linguee, etc.) that can also be customized by the user. 
The tool was built with the aim of integrating terminology management – often considered complex and time-consuming – in the translation workflow of a CAT tool. In this paper, EcoLexiCAT is described along the procedure with which it was evaluated and the results of the evaluation.",{"paper_id":7705,"title":7706,"year":81,"month":855,"day":63,"doi":7707,"resource_url":7708,"first_page":63,"last_page":63,"pdf_url":7709,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7710,"paper_type":860,"authors":7711,"abstract":7726},"lrec2018-main-378","A Danish FrameNet Lexicon and an Annotated Corpus Used for Training and Evaluating a Semantic Frame Classifier","10.63317\u002F5mv4m7fx49ec","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-378","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F586.pdf","pedersen-etal-2018-danish",[7712,7715,7718,7721,7723],{"paper_id":7705,"author_seq":247,"given_name":7713,"surname":7714,"affiliation":63,"orcid":63},"Bolette","Pedersen",{"paper_id":7705,"author_seq":232,"given_name":7716,"surname":7717,"affiliation":63,"orcid":63},"Sanni","Nimb",{"paper_id":7705,"author_seq":218,"given_name":7719,"surname":7720,"affiliation":63,"orcid":63},"Anders","Søgaard",{"paper_id":7705,"author_seq":203,"given_name":7722,"surname":5597,"affiliation":63,"orcid":63},"Mareike",{"paper_id":7705,"author_seq":188,"given_name":7724,"surname":7725,"affiliation":63,"orcid":63},"Sussi","Olsen","In this paper, we present an approach to efficiently compile a Danish FrameNet based on the Danish Thesaurus, focusing in particular on cognition and communication frames. The Danish FrameNet uses the frame and role inventory of the English FrameNet. We present the corresponding corpus annotations of frames and roles and show how our corpus can be used for training and evaluating a semantic frame classifier for cognition and communication frames. 
We also present results of cross-language transfer of a model trained on the English FrameNet. Our approach is significantly faster than building a lexicon from scratch, and we show that it is feasible to annotate Danish with frames developed for English, and finally, that frame annotations – even if limited in size at the current stage – are useful for automatic frame classification.",{"paper_id":7728,"title":7729,"year":81,"month":855,"day":63,"doi":7730,"resource_url":7731,"first_page":63,"last_page":63,"pdf_url":7732,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7733,"paper_type":860,"authors":7734,"abstract":7744},"lrec2018-main-379","SLIDE - a Sentiment Lexicon of Common Idioms","10.63317\u002F3i9iizbhe2gj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-379","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F602.pdf","jochim-etal-2018-slide",[7735,7738,7741,7743],{"paper_id":7728,"author_seq":247,"given_name":7736,"surname":7737,"affiliation":63,"orcid":63},"Charles","Jochim",{"paper_id":7728,"author_seq":232,"given_name":7739,"surname":7740,"affiliation":63,"orcid":63},"Francesca","Bonin",{"paper_id":7728,"author_seq":218,"given_name":2584,"surname":7742,"affiliation":63,"orcid":63},"Bar-Haim",{"paper_id":7728,"author_seq":203,"given_name":1626,"surname":1627,"affiliation":63,"orcid":63},"Idiomatic expressions are problematic for most sentiment analysis approaches, which rely on words as the basic linguistic unit. Compositional solutions for phrase sentiment are not able to handle idioms correctly because their sentiment is not derived from the sentiment of the individual words. Previous work has explored the importance of idioms for sentiment analysis, but has not addressed the breadth of idiomatic expressions in English. In this paper we present an approach for collecting sentiment annotation of idiomatic multiword expressions using crowdsourcing. 
We collect 10 annotations for each idiom and the aggregated label is shown to have good agreement with expert annotations. We describe the resulting publicly available lexicon and how it captures sentiment strength and ambiguity. The Sentiment Lexicon of IDiomatic Expressions (SLIDE) is much larger than previous idiom lexicons. The lexicon includes 5,000 frequently occurring idioms, as estimated from a large English corpus. The idioms were selected from Wiktionary, and over 40% of them were labeled as sentiment-bearing.",{"paper_id":7746,"title":7747,"year":81,"month":855,"day":63,"doi":7748,"resource_url":7749,"first_page":63,"last_page":63,"pdf_url":7750,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7751,"paper_type":860,"authors":7752,"abstract":7762},"lrec2018-main-380","PronouncUR: An Urdu Pronunciation Lexicon Generator","10.63317\u002F3vi8og6nd9cp","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-380","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F646.pdf","bin-zia-etal-2018-pronouncur",[7753,7756,7759],{"paper_id":7746,"author_seq":247,"given_name":7754,"surname":7755,"affiliation":63,"orcid":63},"Haris","Bin Zia",{"paper_id":7746,"author_seq":232,"given_name":7757,"surname":7758,"affiliation":63,"orcid":63},"Agha Ali","Raza",{"paper_id":7746,"author_seq":218,"given_name":7760,"surname":7761,"affiliation":63,"orcid":63},"Awais","Athar","State-of-the-art speech recognition systems rely heavily on three basic components: an acoustic model, a pronunciation lexicon and a language model. To build these components, a researcher needs linguistic as well as technical expertise, which is a barrier in low-resource domains. Techniques to construct these three components without having expert domain knowledge are in great demand. Urdu, despite having millions of speakers all over the world, is a low-resource language in terms of standard publicly available linguistic resources. 
In this paper, we present a grapheme-to-phoneme conversion tool for Urdu that generates a pronunciation lexicon in a form suitable for use with speech recognition systems from a list of Urdu words. The tool predicts the pronunciation of words using a LSTM-based model trained on a handcrafted expert lexicon of around 39,000 words and shows an accuracy of 64% upon internal evaluation. For external evaluation on a speech recognition task, we obtain a word error rate comparable to one achieved using a fully handcrafted expert lexicon.",{"paper_id":7764,"title":7765,"year":81,"month":855,"day":63,"doi":7766,"resource_url":7767,"first_page":63,"last_page":63,"pdf_url":7768,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7769,"paper_type":860,"authors":7770,"abstract":7778},"lrec2018-main-381","SimLex-999 for Polish","10.63317\u002F4juk39vn6uiu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-381","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F687.pdf","mykowiecka-etal-2018-simlex",[7771,7773,7776],{"paper_id":7764,"author_seq":247,"given_name":7186,"surname":7772,"affiliation":63,"orcid":63},"Mykowiecka",{"paper_id":7764,"author_seq":232,"given_name":7774,"surname":7775,"affiliation":63,"orcid":63},"Małgorzata","Marciniak",{"paper_id":7764,"author_seq":218,"given_name":996,"surname":7777,"affiliation":63,"orcid":63},"Rychlik","The paper addresses the Polish version of SimLex-999 which we extended to contain not only measurement of similarity but also relatedness. The data was translated by three independent linguists; discrepancies in translation were resolved by a fourth person. The agreement rates between the translators were counted and an analysis of problems was performed. Then, pairs of words were rated by other annotators on a scale of 0--10 for similarity and relatedness of words. 
Finally, we compared the human annotations with the distributional semantics models  of Polish based on lemmas and forms. We compared our work with the results reported for other languages.",{"paper_id":7780,"title":7781,"year":81,"month":855,"day":63,"doi":7782,"resource_url":7783,"first_page":63,"last_page":63,"pdf_url":7784,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7785,"paper_type":860,"authors":7786,"abstract":7789},"lrec2018-main-382","Finely Tuned, 2 Billion Token Based Word Embeddings for Portuguese","10.63317\u002F2cq75r4kvsdx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-382","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F592.pdf","rodrigues-branco-2018-finely",[7787,7788],{"paper_id":7780,"author_seq":247,"given_name":3884,"surname":4507,"affiliation":63,"orcid":63},{"paper_id":7780,"author_seq":232,"given_name":1332,"surname":1333,"affiliation":63,"orcid":63},"A distributional semantics model --- also known as word embeddings --- is a major asset for any language as the research results reported in the literature have consistently shown that it is instrumental to improve the performance of a wide range of applications and processing tasks for that language.  In this paper, we describe the development of an advanced distributional model for Portuguese, with the largest vocabulary and the best evaluation scores published so far. This model was made possible by resorting to new languages resources we recently developed: to a much larger training corpus than before and to a more sophisticated evaluation supported by new and more fine-grained evaluation tasks and data sets. 
We also indicate how the new language resource reported on here is being distributed and where it can be obtained for free under a most permissive license.",{"paper_id":7791,"title":7792,"year":81,"month":855,"day":63,"doi":7793,"resource_url":7794,"first_page":63,"last_page":63,"pdf_url":7795,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7796,"paper_type":860,"authors":7797,"abstract":7803},"lrec2018-main-383","Teanga: A Linked Data based platform for Natural Language Processing","10.63317\u002F5frhtboej2xt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-383","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F106.pdf","ziad-etal-2018-teanga",[7798,7801,7802],{"paper_id":7791,"author_seq":247,"given_name":7799,"surname":7800,"affiliation":63,"orcid":63},"Housam","Ziad",{"paper_id":7791,"author_seq":232,"given_name":3687,"surname":3688,"affiliation":63,"orcid":63},{"paper_id":7791,"author_seq":218,"given_name":3690,"surname":3691,"affiliation":63,"orcid":63},"In this paper, we describe Teanga, a linked data based platform for natural language processing (NLP). Teanga enables the use of many NLP services from a single interface, whether the need was to use a single service or multiple services in a pipeline. Teanga focuses on the problem of NLP services interoperability by using linked data to define the types of services input and output. 
Teanga’s strengths include being easy to install and run, easy to use, able to run multiple NLP tasks from one interface and helping users to build a pipeline of tasks through a graphical user interface.",{"paper_id":7805,"title":7806,"year":81,"month":855,"day":63,"doi":7807,"resource_url":7808,"first_page":63,"last_page":63,"pdf_url":7809,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7810,"paper_type":860,"authors":7811,"abstract":7819},"lrec2018-main-384","Automatic and Manual Web Annotations in an Infrastructure to handle Fake News and other Online Media Phenomena","10.63317\u002F34zoh52s4bdi","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-384","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F380.pdf","rehm-etal-2018-automatic",[7812,7815,7817],{"paper_id":7805,"author_seq":247,"given_name":7813,"surname":7814,"affiliation":63,"orcid":63},"Georg","Rehm",{"paper_id":7805,"author_seq":232,"given_name":3077,"surname":7816,"affiliation":63,"orcid":63},"Moreno-Schneider",{"paper_id":7805,"author_seq":218,"given_name":1474,"surname":7818,"affiliation":63,"orcid":63},"Bourgonje","Online media are ubiquitous and consumed by billions of people globally. Recently, however, several phenomena regarding online media have emerged that pose a severe threat to media consumption and reception as well as to the potential of manipulating opinions and, thus, (re)actions, on a large scale. Lumped together under the label “fake news”, these phenomena comprise, among others, maliciously manipulated content, bad journalism, parodies, satire, propaganda and several other types of false news; related phenomena are the often cited filter bubble (echo chamber) effect and the amount of abusive language used online. In an earlier paper we describe an architectural and technological approach to empower users to handle these online media phenomena. 
In this article we provide the first approach of a metadata scheme to enable, eventually, the standardised annotation of these phenomena in online media. We also show an initial version of a tool that enables the creation, visualisation and exploitation of such annotations.",{"paper_id":7821,"title":7822,"year":81,"month":855,"day":63,"doi":7823,"resource_url":7824,"first_page":63,"last_page":63,"pdf_url":7825,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7826,"paper_type":860,"authors":7827,"abstract":7834},"lrec2018-main-385","The LODeXporter: Flexible Generation of Linked Open Data Triples from NLP Frameworks for Automatic Knowledge Base Construction","10.63317\u002F2ns28esosyby","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-385","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F773.pdf","witte-sateli-2018-lodexporter",[7828,7831],{"paper_id":7821,"author_seq":247,"given_name":7829,"surname":7830,"affiliation":63,"orcid":63},"René","Witte",{"paper_id":7821,"author_seq":232,"given_name":7832,"surname":7833,"affiliation":63,"orcid":63},"Bahar","Sateli","We present LODeXporter, a novel approach for exporting Natural Language Processing (NLP) results to a graph-based knowledge base, following Linked Open Data (LOD) principles.        The rules for transforming NLP entities into Resource Description Framework (RDF) triples are described in a custom mapping language, which is defined in RDF Schema (RDFS) itself, providing a separation of concerns between NLP pipeline engineering and knowledge base engineering. 
LODeXporter is available as an open source component for the GATE (General Architecture for Text Engineering) framework.",{"paper_id":7836,"title":7837,"year":81,"month":855,"day":63,"doi":7838,"resource_url":7839,"first_page":63,"last_page":63,"pdf_url":7840,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7841,"paper_type":860,"authors":7842,"abstract":7857},"lrec2018-main-386","LiDo RDF: From a Relational Database to a Linked Data Graph of Linguistic Terms and Bibliographic Data","10.63317\u002F2i8nmvm5qs6k","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-386","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F812.pdf","klimek-etal-2018-lido",[7843,7846,7848,7851,7854],{"paper_id":7836,"author_seq":247,"given_name":7844,"surname":7845,"affiliation":63,"orcid":63},"Bettina","Klimek",{"paper_id":7836,"author_seq":232,"given_name":3376,"surname":7847,"affiliation":63,"orcid":63},"Schädlich",{"paper_id":7836,"author_seq":218,"given_name":7849,"surname":7850,"affiliation":63,"orcid":63},"Dustin","Kröger",{"paper_id":7836,"author_seq":203,"given_name":7852,"surname":7853,"affiliation":63,"orcid":63},"Edwin","Knese",{"paper_id":7836,"author_seq":188,"given_name":7855,"surname":7856,"affiliation":63,"orcid":63},"Benedikt","Elßmann","Forty years ago the linguist Dr. Christian Lehmann developed a framework for documenting linguistic terms, concepts and bibliographic data that resulted in the LiDo Terminological and Bibliographical Database (LiDo TBD). Since 2006 students and linguistic researchers benefit from the data by looking it up on the Web. Even though, the LiDo TBD is implemented as a relational database, its underlying framework aims at yielding a terminological network containing data nodes that are connected via specific relation edges in order to create an interrelated data graph. 
Now, with the emergence of Semantic Web technologies we were able to implement this pioneering work by converting the LiDo TBD relational database into a Linked Data graph. In this paper we present and describe the creation of the LiDo RDF dataset and introduce the LiDo RDF project. The goals of this project are to enable the direct use and reuse of the data both for the scientific research community and machine processing alike as well as to enable a valuable enrichment of already existing linguistic terminological and bibliographic data by including LiDo RDF in the LLOD cloud.",{"paper_id":7859,"title":7860,"year":81,"month":855,"day":63,"doi":7861,"resource_url":7862,"first_page":63,"last_page":63,"pdf_url":7863,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7864,"paper_type":860,"authors":7865,"abstract":7877},"lrec2018-main-387","Towards a Linked Open Data Edition of Sumerian Corpora","10.63317\u002F493ptbrava6k","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-387","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F862.pdf","chiarcos-etal-2018-towards",[7866,7867,7870,7873,7874],{"paper_id":7859,"author_seq":247,"given_name":904,"surname":2684,"affiliation":63,"orcid":63},{"paper_id":7859,"author_seq":232,"given_name":7868,"surname":7869,"affiliation":63,"orcid":63},"Émilie","Pagé-Perron",{"paper_id":7859,"author_seq":218,"given_name":7871,"surname":7872,"affiliation":63,"orcid":63},"Ilya","Khait",{"paper_id":7859,"author_seq":203,"given_name":2504,"surname":2505,"affiliation":63,"orcid":63},{"paper_id":7859,"author_seq":188,"given_name":7875,"surname":7876,"affiliation":63,"orcid":63},"Lucas","Reckling","Linguistic Linked Open Data (LLOD) is a flourishing line of research in the language resource community, so far mostly adopted for selected aspects of linguistics, natural language processing and the semantic web, as well as for practical applications in localization and lexicography. 
Yet, computational philology seems to be somewhat decoupled from the recent progress in this area: even though LOD as a concept is gaining significant popularity in Digital Humanities, existing LLOD standards and vocabularies are not widely used in this community, and philological resources are underrepresented in the LLOD cloud diagram (http:\u002F\u002Flinguistic-lod.org\u002Fllod-cloud).   In this paper, we present an application of Linguistic Linked Open Data in Assyriology. We describe the LLOD edition of a linguistically annotated corpus of Sumerian, as well as its linking with lexical resources, repositories of annotation terminology, and the museum collections in which the artifacts bearing these texts are kept. The chosen corpus is the Electronic Text Corpus of Sumerian Royal Inscriptions, a well curated and linguistically annotated archive of Sumerian text, in preparation for the creating and linking of other corpora of cuneiform texts, such as the corpus of Ur III administrative and legal Sumerian texts, as part of the Machine Translation and Automated Analysis of Cuneiform Languages project(https:\u002F\u002Fcdli-gh.github.io\u002Fmtaac\u002F).",{"paper_id":7879,"title":7880,"year":81,"month":855,"day":63,"doi":7881,"resource_url":7882,"first_page":63,"last_page":63,"pdf_url":7883,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7884,"paper_type":860,"authors":7885,"abstract":7889},"lrec2018-main-388","A Bird’s-eye View of Language Processing Projects at the Romanian Academy","10.63317\u002F52iu9mr8ye37","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-388","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F391.pdf","tufis-cristea-2018-birds",[7886,7887],{"paper_id":7879,"author_seq":247,"given_name":2383,"surname":4359,"affiliation":63,"orcid":63},{"paper_id":7879,"author_seq":232,"given_name":2383,"surname":7888,"affiliation":63,"orcid":63},"Cristea","This article gives a general overview 
of five AI language-related projects that address contemporary Romanian language, in both textual and speech form, language related applications, as well as collections of old historic documents and medical archives. Namely, these projects deal with: the creation of a contemporary Romanian language text and speech corpus, resources and technologies for developing human-machine interfaces in spoken Romanian, digitization and transcription of old Romanian language documents drafted in Cyrillic into the modern Latin alphabet, digitization of the oldest archive of diabetes medical records and dialogue systems with personal robots and autonomous vehicles. The technologies involved for attaining the objectives range from image processing (intelligent character recognition for hand-writing and old Romanian documents) to natural language and speech processing techniques (corpus compiling and documentation, multi-level processing, transliteration of different old scripts into modern Romanian, command language processing, various levels of speech-text alignments, ASR, TTS, keyword spotting, etc.). Some of these projects are approaching the end, others have just started and others are about to start. 
All the reported projects are national ones, less documented than the international projects we are\u002Fwere engaged in, and involve large teams of experts and master\u002FPhD students from computer science, mathematics, linguistics, philology and library sciences.",{"paper_id":7891,"title":7892,"year":81,"month":855,"day":63,"doi":7893,"resource_url":7894,"first_page":63,"last_page":63,"pdf_url":7895,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7896,"paper_type":860,"authors":7897,"abstract":7909},"lrec2018-main-389","PMKI: an European Commission action for the interoperability, maintainability and sustainability of Language Resources","10.63317\u002F562meutxopc6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-389","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F537.pdf","schmitz-etal-2018-pmki",[7898,7900,7903,7906],{"paper_id":7891,"author_seq":247,"given_name":1474,"surname":7899,"affiliation":63,"orcid":63},"Schmitz",{"paper_id":7891,"author_seq":232,"given_name":7901,"surname":7902,"affiliation":63,"orcid":63},"Enrico","Francesconi",{"paper_id":7891,"author_seq":218,"given_name":7904,"surname":7905,"affiliation":63,"orcid":63},"Najeh","Hajlaoui",{"paper_id":7891,"author_seq":203,"given_name":7907,"surname":7908,"affiliation":63,"orcid":63},"Brahim","Batouche","The paper presents the Public Multilingual Knowledge Management Infrastructure (PMKI) action launched by the European Commission (EC) to promote the Digital Single Market in the European Union (EU). PMKI aims to share maintainable and sustainable Language Resources making them interoperable in order to support language technology industry, and public administrations, with multilingual tools able to improve cross border accessibility of digital services. The paper focuses on the main feature (interoperability) that represents the specificity of PMKI platform distinguishing it from other existing frameworks. 
In particular it aims to create a set of tools and facilities, based on Semantic Web technologies, to establish semantic interoperability between multilingual lexicons. Such task requires to harmonize in general multilingual language resources using standardised representation with respect to a defined core data model under an adequate architecture. A comparative study among the main data models for representing lexicons and recommendations for the PMKI service was required as well. Moreover, synergies with other programs of the EU institutions, as far as systems interoperability and Machine Translation (MT) solutions, are foreseen. For instance some interactions are foreseen between PMKI and MT service provided by the EC but also with other NLP applications.",{"paper_id":7911,"title":7912,"year":81,"month":855,"day":63,"doi":7913,"resource_url":7914,"first_page":63,"last_page":63,"pdf_url":7915,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7916,"paper_type":860,"authors":7917,"abstract":7920},"lrec2018-main-390","The Abkhaz National Corpus","10.63317\u002F5mjaixk82n7m","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-390","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F548.pdf","meurer-2018-abkhaz",[7918],{"paper_id":7911,"author_seq":247,"given_name":3690,"surname":7919,"affiliation":63,"orcid":63},"Meurer","In this paper, we present the Abkhaz National Corpus, a comprehensive and open, grammatically annotated text corpus which makes the Abkhaz language accessible to scientific investigations from various perspectives (linguistics, literary studies, history, political and social sciences etc.). The corpus also serves as a means for the long-term preservation of Abkhaz language documents in digital form, and as a pedagogical tool for language learning. It now comprises more than 10 million words and is continuously being extended. 
Abkhaz is a lesser-resourced language; prior to this work virtually no computational resources for the language were available. As a member of the West-Caucasian language family, which is characterized by an extremely rich, polysynthetic morphological structure, Abkhaz poses serious challenges to morphosyntactic analysis, the main problem being the high degree of morphological ambiguity. We show how these challenges can be met, and what we plan to further enhance the performance of the analyser.",{"paper_id":7922,"title":7923,"year":81,"month":855,"day":63,"doi":7924,"resource_url":7925,"first_page":63,"last_page":63,"pdf_url":7926,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7927,"paper_type":860,"authors":7928,"abstract":7935},"lrec2018-main-391","Collecting Language Resources from Public Administrations in the Nordic and Baltic Countries","10.63317\u002F2aqzc8y636j8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-391","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1086.pdf","vasiljevs-etal-2018-collecting",[7929,7930,7931,7932],{"paper_id":7922,"author_seq":247,"given_name":4819,"surname":4820,"affiliation":63,"orcid":63},{"paper_id":7922,"author_seq":232,"given_name":4843,"surname":4844,"affiliation":63,"orcid":63},{"paper_id":7922,"author_seq":218,"given_name":1701,"surname":4846,"affiliation":63,"orcid":63},{"paper_id":7922,"author_seq":203,"given_name":7933,"surname":7934,"affiliation":63,"orcid":63},"Aivars","Bērziņš","This paper presents Tilde’s work on collecting language resources from government institutions and other public administrations in the Nordic and Baltic countries. 
We introduce the activities and results of the European Language Resources Coordination (ELRC) action in this region, provide a synopsis of ELRC workshops held in all countries of the region, identify potential holders and donors of language data suitable for improving machine translation systems, and describe the language resources collected so far. We also describe several national projects and initiatives on sharing of language data accumulated in the public sector and creation of new language resources from this data. Opportunities and challenges in consolidating language data from the public sector are discussed, and related actions and regulatory initiatives are proposed.",{"paper_id":7937,"title":7938,"year":81,"month":855,"day":63,"doi":7939,"resource_url":7940,"first_page":63,"last_page":63,"pdf_url":7941,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7942,"paper_type":860,"authors":7943,"abstract":7956},"lrec2018-main-392","LIdioms: A Multilingual Linked Idioms Data Set","10.63317\u002F2md6xukrx6x5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-392","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F46.pdf","moussallem-etal-2018-lidioms",[7944,7947,7949,7951,7953],{"paper_id":7937,"author_seq":247,"given_name":7945,"surname":7946,"affiliation":63,"orcid":63},"Diego","Moussallem",{"paper_id":7937,"author_seq":232,"given_name":7948,"surname":1299,"affiliation":63,"orcid":63},"Mohamed Ahmed",{"paper_id":7937,"author_seq":218,"given_name":7945,"surname":7950,"affiliation":63,"orcid":63},"Esteves",{"paper_id":7937,"author_seq":203,"given_name":1656,"surname":7952,"affiliation":63,"orcid":63},"Zampieri",{"paper_id":7937,"author_seq":188,"given_name":7954,"surname":7955,"affiliation":63,"orcid":63},"Axel-Cyrille","Ngonga Ngomo","In this paper, we describe the LIDIOMS data set, a multilingual RDF representation of idioms currently containing five languages: English, German, Italian, Portuguese, 
and Russian. The data set is intended to support natural language processing applications by providing links between idioms across languages. The underlying data was crawled and integrated from various sources. To ensure the quality of the crawled data, all idioms were evaluated by at least two native speakers. Herein, we present the model devised for structuring the data. We also provide the details of linking LIDIOMS to well-known multilingual data sets such as BabelNet. The resulting data set complies with best practices according to Linguistic Linked Open Data Community.",{"paper_id":7958,"title":7959,"year":81,"month":855,"day":63,"doi":7960,"resource_url":7961,"first_page":63,"last_page":63,"pdf_url":7962,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7963,"paper_type":860,"authors":7964,"abstract":7973},"lrec2018-main-393","Annotating Modality Expressions and Event Factuality for a Japanese Chess Commentary Corpus","10.63317\u002F48p4666hoiij","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-393","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F54.pdf","matsuyoshi-etal-2018-annotating",[7965,7968,7971,7972],{"paper_id":7958,"author_seq":247,"given_name":7966,"surname":7967,"affiliation":63,"orcid":63},"Suguru","Matsuyoshi",{"paper_id":7958,"author_seq":232,"given_name":7969,"surname":7970,"affiliation":63,"orcid":63},"Hirotaka","Kameko",{"paper_id":7958,"author_seq":218,"given_name":6160,"surname":6161,"affiliation":63,"orcid":63},{"paper_id":7958,"author_seq":203,"given_name":6154,"surname":3104,"affiliation":63,"orcid":63},"In recent years, there has been a surge of interest in the natural language processing related to the real world, such as symbol grounding, language generation, and nonlinguistic data search by natural language queries. We argue that shogi (Japanese chess) commentaries, which are accompanied by game states, are an interesting testbed for these tasks. 
A commentator refers not only to the current board state but to past and future moves, and yet such references can be grounded in the game tree, possibly with the help of modern game-tree search algorithms. For this reason, we previously collected shogi commentaries together with board states and have been developing a game commentary generator. In this paper, we augment the corpus with manual annotation of modality expressions and event factuality. The annotated corpus includes 1,622 modality expressions, 5,014 event class tags and 3,092 factuality tags. It can be used to train a computer to identify words and phrases that signal factuality and to determine events with the said factuality, paving the way for grounding possible and counterfactual states.",{"paper_id":7975,"title":7976,"year":81,"month":855,"day":63,"doi":7977,"resource_url":7978,"first_page":63,"last_page":63,"pdf_url":7979,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7980,"paper_type":860,"authors":7981,"abstract":7991},"lrec2018-main-394","Annotating Chinese Light Verb Constructions according to PARSEME guidelines","10.63317\u002F4sqtpe2zfdvg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-394","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F271.pdf","jiang-etal-2018-annotating",[7982,7984,7987,7989],{"paper_id":7975,"author_seq":247,"given_name":7983,"surname":6441,"affiliation":63,"orcid":63},"Menghan",{"paper_id":7975,"author_seq":232,"given_name":7985,"surname":7986,"affiliation":63,"orcid":63},"Natalia","Klyueva",{"paper_id":7975,"author_seq":218,"given_name":7988,"surname":1585,"affiliation":63,"orcid":63},"Hongzhi",{"paper_id":7975,"author_seq":203,"given_name":7990,"surname":3506,"affiliation":63,"orcid":63},"Chu-Ren","In this paper we present a preliminary study on application of PARSEME guidelines of annotating mutiword expressions to Chinese language. 
We focus on one specific category - light verb constructions (LVCs). We make use of an existing resource containing Chinese light verbs and examine whether this resource fulfill the requirements of the guidelines. We make a preliminary annotation of a Chinese UD treebank in two steps: first automatically identifying potential light verbs and then manually assigning the corresponding nouns or correcting false positives.",{"paper_id":7993,"title":7994,"year":81,"month":855,"day":63,"doi":7995,"resource_url":7996,"first_page":63,"last_page":63,"pdf_url":7997,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":7998,"paper_type":860,"authors":7999,"abstract":8012},"lrec2018-main-395","Using English Baits to Catch Serbian Multi-Word Terminology","10.63317\u002F5et92fq3bkwi","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-395","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F384.pdf","krstev-etal-2018-using",[8000,8003,8006,8009],{"paper_id":7993,"author_seq":247,"given_name":8001,"surname":8002,"affiliation":63,"orcid":63},"Cvetana","Krstev",{"paper_id":7993,"author_seq":232,"given_name":8004,"surname":8005,"affiliation":63,"orcid":63},"Branislava","Šandrih",{"paper_id":7993,"author_seq":218,"given_name":8007,"surname":8008,"affiliation":63,"orcid":63},"Ranka","Stanković",{"paper_id":7993,"author_seq":203,"given_name":8010,"surname":8011,"affiliation":63,"orcid":63},"Miljana","Mladenović","In this paper we present the first results in bilingual terminology extraction. The hypothesis of our approach is that if for a source language domain terminology exists as well as a domain aligned corpus for a source and a target language, then it is possible to extract the terminology for a target language. Our approach relies on several resources and tools: aligned domain texts, domain terminology for a source language, a terminology extractor for a target language, and a tool for word and chunk alignment. 
In this first experiment a source language is English, a target language is Serbian, a domain is Library and Information Science for which a bilingual terminological dictionary exists. Our term extractor is based on e-dictionaries and shallow parsing, and for word alignment we use GIZA++. At the end of procedure we included a supervised binary classifier that decides whether an extracted term is a valid domain term. The classifier was evaluated in a 5-fold cross validation setting on a slightly unbalanced dataset, maintaining average F-score of 89%. After conducting the experiment our system extracted 846 different Serbian domain phrases, containing 515 Serbian phrases that were not present in the existing domain terminology.",{"paper_id":8014,"title":8015,"year":81,"month":855,"day":63,"doi":8016,"resource_url":8017,"first_page":63,"last_page":63,"pdf_url":8018,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8019,"paper_type":860,"authors":8020,"abstract":8026},"lrec2018-main-396","Construction of Large-scale English Verbal Multiword Expression Annotated Corpus","10.63317\u002F45yhfs4ifpgh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-396","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F394.pdf","kato-etal-2018-construction",[8021,8024,8025],{"paper_id":8014,"author_seq":247,"given_name":8022,"surname":8023,"affiliation":63,"orcid":63},"Akihiko","Kato",{"paper_id":8014,"author_seq":232,"given_name":3651,"surname":4122,"affiliation":63,"orcid":63},{"paper_id":8014,"author_seq":218,"given_name":3649,"surname":2464,"affiliation":63,"orcid":63},"Multiword expressions (MWEs) consist of groups of tokens, which should be treated as a single syntactic or semantic unit. In this work, we focus on verbal MWEs (VMWEs), whose accurate recognition is challenging because they could be discontinuous (e.g., take .. off). 
Since previous English VMWE annotations are relatively small-scale in terms of VMWE occurrences and types, we conduct large-scale annotations of VMWEs on the Wall Street Journal portion of English Ontonotes by a combination of automatic annotations and crowdsourcing. Concretely, we first construct a VMWE dictionary based on the English-language Wiktionary. After that, we collect possible VMWE occurrences in Ontonotes and filter candidates with the help of gold dependency trees, then we formalize VMWE annotations as a multiword sense disambiguation problem to exploit crowdsourcing. As a result, we annotate 7,833 VMWE instances belonging to various categories, such as phrasal verbs, light verb constructions, and semi-fixed VMWEs. We hope this large-scale VMWE-annotated resource helps to develop models for MWE recognition and dependency parsing that are aware of English MWEs. Our resource is publicly available.",{"paper_id":8028,"title":8029,"year":81,"month":855,"day":63,"doi":8030,"resource_url":8031,"first_page":63,"last_page":63,"pdf_url":8032,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8033,"paper_type":860,"authors":8034,"abstract":8049},"lrec2018-main-397","Konbitzul: an MWE-specific database for Spanish-Basque","10.63317\u002F4h3mrppewhk2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-397","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F438.pdf","inurrieta-etal-2018-konbitzul",[8035,8038,8041,8043,8046],{"paper_id":8028,"author_seq":247,"given_name":8036,"surname":8037,"affiliation":63,"orcid":63},"Uxoa","Iñurrieta",{"paper_id":8028,"author_seq":232,"given_name":8039,"surname":8040,"affiliation":63,"orcid":63},"Itziar","Aduriz",{"paper_id":8028,"author_seq":218,"given_name":3288,"surname":8042,"affiliation":63,"orcid":63},"Díaz de 
Ilarraza",{"paper_id":8028,"author_seq":203,"given_name":8044,"surname":8045,"affiliation":63,"orcid":63},"Gorka","Labaka",{"paper_id":8028,"author_seq":188,"given_name":8047,"surname":8048,"affiliation":63,"orcid":63},"Kepa","Sarasola","This paper presents Konbitzul, an online database of verb+noun MWEs in Spanish and Basque. It collects a list of MWEs with their translations, as well as linguistic information which is NLP-applicable: it helps to identify occurrences of MWEs in multiple morphosyn- tactic variants, and it is also useful for improving translation quality in rule-based MT. In addition to this, its user-friendly interface makes it possible to simply search for MWEs along with translations, just as in any bilingual phraseological dictionary.",{"paper_id":8051,"title":8052,"year":81,"month":855,"day":63,"doi":8053,"resource_url":8054,"first_page":63,"last_page":63,"pdf_url":8055,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8056,"paper_type":860,"authors":8057,"abstract":8072},"lrec2018-main-398","A Multilingual Test Collection for the Semantic Search of Entity 
Categories","10.63317\u002F4ccyf4ku5i8i","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-398","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F452.pdf","sales-etal-2018-multilingual",[8058,8059,8060,8063,8066,8069,8070,8071],{"paper_id":8051,"author_seq":247,"given_name":4777,"surname":4778,"affiliation":63,"orcid":63},{"paper_id":8051,"author_seq":232,"given_name":4782,"surname":4783,"affiliation":63,"orcid":63},{"paper_id":8051,"author_seq":218,"given_name":8061,"surname":8062,"affiliation":63,"orcid":63},"Wellington","Franco",{"paper_id":8051,"author_seq":203,"given_name":8064,"surname":8065,"affiliation":63,"orcid":63},"Bernhard","Bermeitinger",{"paper_id":8051,"author_seq":188,"given_name":8067,"surname":8068,"affiliation":63,"orcid":63},"Tiago","Cunha",{"paper_id":8051,"author_seq":172,"given_name":4785,"surname":4786,"affiliation":63,"orcid":63},{"paper_id":8051,"author_seq":155,"given_name":4788,"surname":4789,"affiliation":63,"orcid":63},{"paper_id":8051,"author_seq":138,"given_name":4791,"surname":4792,"affiliation":63,"orcid":63},"Humans naturally organise and classify the world into sets and categories. These categories expressed in natural language are present in all data artefacts from structured to unstructured data and play a fundamental role as tags, dataset predicates or ontology attributes. A better understanding of the category syntactic structure and how to match them semantically is a fundamental problem in the computational linguistics domain. Despite the high popularity of entity search, entity categories have not been receiving equivalent attention. This paper aims to present the task of semantic search of entity categories by defining, developing and making publicly available a multilingual test collection comprehending English, Portuguese and German. 
The test collections were designed to meet the demands of the entity search community in providing more representative and semantically complex query sets. In addition, we also provide comparative baselines and a brief analysis of the results.",{"paper_id":8074,"title":8075,"year":81,"month":855,"day":63,"doi":8076,"resource_url":8077,"first_page":63,"last_page":63,"pdf_url":8078,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8079,"paper_type":860,"authors":8080,"abstract":8085},"lrec2018-main-399","Towards the Inference of Semantic Relations in Complex Nominals: a Pilot Study","10.63317\u002F55wezcpggzy9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-399","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F884.pdf","cabezas-garcia-leon-arauz-2018-towards",[8081,8084],{"paper_id":8074,"author_seq":247,"given_name":8082,"surname":8083,"affiliation":63,"orcid":63},"Melania","Cabezas-García",{"paper_id":8074,"author_seq":232,"given_name":4053,"surname":4054,"affiliation":63,"orcid":63},"Complex nominals (CNs) (e.g. wind turbine) are very common in English specialized texts (Nakov, 2013). However, all too frequently they show similar external forms but encode different semantic relations because of noun packing. This paper describes the use of paraphrases that convey the conceptual content of English two-term CNs (Nakov and Hearst, 2006) in the domain of environmental science. The semantic analysis of CNs was complemented by the use of knowledge patterns (KPs), which are lexico-syntactic patterns that usually convey semantic relations in real texts (Meyer, 2001; Marshman, 2006). Furthermore, the constituents of CNs were semantically annotated with conceptual categories (e.g. beach [LANDFORM] erosion [PROCESS]) with a view to disambiguating the semantic relation between the constituents of the CN and developing a procedure to infer the semantic relations in these multi-word terms. 
The results showed that the combination of KPs and paraphrases is a helpful approach to the semantics of CNs. Accordingly, the conceptual annotation of the constituents of CNs revealed similar patterns in the formation of these complex terms, which can lead to the inference of concealed semantic relations.",{"paper_id":8087,"title":8088,"year":81,"month":855,"day":63,"doi":8089,"resource_url":8090,"first_page":63,"last_page":63,"pdf_url":8091,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8092,"paper_type":860,"authors":8093,"abstract":8101},"lrec2018-main-400","Generation of a Spanish Artificial Collocation Error Corpus","10.63317\u002F3p2wmfviji86","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-400","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F970.pdf","rodriguez-fernandez-etal-2018-generation",[8094,8096,8098],{"paper_id":8087,"author_seq":247,"given_name":2650,"surname":8095,"affiliation":63,"orcid":63},"Rodríguez-Fernández",{"paper_id":8087,"author_seq":232,"given_name":2647,"surname":8097,"affiliation":63,"orcid":63},"Carlini",{"paper_id":8087,"author_seq":218,"given_name":8099,"surname":8100,"affiliation":63,"orcid":63},"Leo","Wanner","Collocations such as heavy rain or make [a] decision are combinations of two elements where one (the base) is freely chosen, while the choice of the other (collocate) is restricted by the base. Research has consistently shown that collocations present difficulties even to the most advanced language learners, so that computational tools aimed at supporting them in the process of language learning can be of great value. However, in contrast to grammatical error detection and correction, collocation error marking and correction has not yet received the attention it deserves. 
This is unsurprising, considering the lack of existing collocation resources, in particular those that capture the different types of collocation errors, and the high cost of a manual creation of such resources. In this paper, we present an algorithm for the automatic generation of an artificial collocation error corpus of American English learners of Spanish that includes 17 different types of collocation errors and that can be used for automatic detection and classification of collocation errors in the writings of Spanish language learners.",{"paper_id":8103,"title":8104,"year":81,"month":855,"day":63,"doi":8105,"resource_url":8106,"first_page":63,"last_page":63,"pdf_url":8107,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8108,"paper_type":860,"authors":8109,"abstract":8114},"lrec2018-main-401","Improving a Neural-based Tagger for Multiword Expressions Identification","10.63317\u002F4faqh67w944o","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-401","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1071.pdf","varis-klyueva-2018-improving",[8110,8113],{"paper_id":8103,"author_seq":247,"given_name":8111,"surname":8112,"affiliation":63,"orcid":63},"Dušan","Variš",{"paper_id":8103,"author_seq":232,"given_name":7985,"surname":7986,"affiliation":63,"orcid":63},"In this paper, we present a set of improvements introduced to MUMULS, a tagger for the automatic detection of verbal multiword expressions. Our tagger participated in the PARSEME shared task and it was the only one based on neural networks. We show that character-level embeddings can improve the performance, mainly by reducing the out-of-vocabulary rate. Furthermore, replacing the softmax layer in the decoder by a conditional random field classifier brings additional improvements. Finally, we compare different context-aware feature representations of input tokens using various encoder architectures. 
The experiments on Czech show that the combination of character-level embeddings using a convolutional network, self-attentive encoding layer over the word representations and an output conditional random field classifier yields the best empirical results.",{"paper_id":8116,"title":8117,"year":81,"month":855,"day":63,"doi":8118,"resource_url":8119,"first_page":63,"last_page":63,"pdf_url":8120,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8121,"paper_type":860,"authors":8122,"abstract":8131},"lrec2018-main-402","Designing a Russian Idiom-Annotated Corpus","10.63317\u002F4h8vj3rrgyg4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-402","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1095.pdf","aharodnik-etal-2018-designing",[8123,8126,8128],{"paper_id":8116,"author_seq":247,"given_name":8124,"surname":8125,"affiliation":63,"orcid":63},"Katsiaryna","Aharodnik",{"paper_id":8116,"author_seq":232,"given_name":884,"surname":8127,"affiliation":63,"orcid":63},"Feldman",{"paper_id":8116,"author_seq":218,"given_name":8129,"surname":8130,"affiliation":63,"orcid":63},"Jing","Peng","This paper describes the development of an idiom-annotated corpus of Russian. The corpus is compiled from freely available resources online and contains texts of different genres. The idiom extraction, annotation procedure, and a pilot experiment using the new corpus are outlined in the paper. 
Considering the scarcity of publicly available Russian annotated corpora, the corpus is a much-needed resource that can be utilized for literary, linguistic studies, pedagogy as well as for various Natural Language Processing tasks.",{"paper_id":8133,"title":8134,"year":81,"month":855,"day":63,"doi":8135,"resource_url":8136,"first_page":63,"last_page":63,"pdf_url":8137,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8138,"paper_type":860,"authors":8139,"abstract":8144},"lrec2018-main-403","DeepTC – An Extension of DKPro Text Classification for Fostering Reproducibility of Deep Learning Experiments","10.63317\u002F5fmkk6w4k8uw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-403","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F45.pdf","horsmann-zesch-2018-deeptc",[8140,8143],{"paper_id":8133,"author_seq":247,"given_name":8141,"surname":8142,"affiliation":63,"orcid":63},"Tobias","Horsmann",{"paper_id":8133,"author_seq":232,"given_name":5014,"surname":5015,"affiliation":63,"orcid":63},"We present a deep learning extension for the multi-purpose text classification framework DKPro Text Classification (DKPro TC). DKPro TC is a flexible framework for creating easily shareable and reproducible end-to-end NLP experiments involving machine learning. We provide an overview of the current state of DKPro TC, which does not allow integration of deep learning, and discuss the necessary conceptual extensions. These extensions are based on an analysis of common deep learning setups found in the literature to support all common text classification setups, i.e. single outcome, multi outcome, and sequence classification problems. Additionally to providing an end-to-end shareable environment for deep learning experiments, we provide convenience features that take care of repetitive steps, such as pre-processing, data vectorization and pruning of embeddings. 
By moving a large part of this boilerplate code into DKPro TC, the actual deep learning framework code improves in readability and lowers the amount of redundant source code considerably. As proof-of-concept, we integrate Keras, DyNet, and DeepLearning4J.",{"paper_id":8146,"title":8147,"year":81,"month":855,"day":63,"doi":8148,"resource_url":8149,"first_page":63,"last_page":63,"pdf_url":8150,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8151,"paper_type":860,"authors":8152,"abstract":8159},"lrec2018-main-404","Improving Hate Speech Detection with Deep Learning Ensembles","10.63317\u002F23pfnoj4u7xi","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-404","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F292.pdf","zimmerman-etal-2018-improving",[8153,8155,8157],{"paper_id":8146,"author_seq":247,"given_name":1085,"surname":8154,"affiliation":63,"orcid":63},"Zimmerman",{"paper_id":8146,"author_seq":232,"given_name":1451,"surname":8156,"affiliation":63,"orcid":63},"Kruschwitz",{"paper_id":8146,"author_seq":218,"given_name":1367,"surname":8158,"affiliation":63,"orcid":63},"Fox","Hate speech has become a major issue that is currently a hot topic in the domain of social media.  Simultaneously, current proposed methods to address the issue raise concerns about censorship.  Broadly speaking, our research focus is the area human rights, including the development of new methods to identify and better address discrimination while protecting freedom of expression. As neural network approaches are becoming state of the art for text classification problems, an ensemble method is adapted for usage with neural networks and is presented to better classify hate speech.   Our method utilizes a publicly available embedding model, which is tested against a hate speech corpus from Twitter.  To confirm robustness of our results, we additionally test against a popular sentiment dataset.  
Given our goal, we are pleased that our method has a nearly 5 point improvement in F-measure when compared to original work on a publicly available hate speech evaluation dataset.   We also note difficulties encountered with reproducibility of deep learning methods and comparison of findings from other work.  Based on our experience, more details are needed in published work reliant on deep learning methods, with additional evaluation information a consideration too.  This information is provided to foster discussion within the research community for future work.",{"paper_id":8161,"title":8162,"year":81,"month":855,"day":63,"doi":8163,"resource_url":8164,"first_page":63,"last_page":63,"pdf_url":8165,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8166,"paper_type":860,"authors":8167,"abstract":8174},"lrec2018-main-405","Distributional Term Set Expansion","10.63317\u002F4c2tjnv6uqfd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-405","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F303.pdf","cuba-gyllensten-sahlgren-2018-distributional",[8168,8171],{"paper_id":8161,"author_seq":247,"given_name":8169,"surname":8170,"affiliation":63,"orcid":63},"Amaru","Cuba Gyllensten",{"paper_id":8161,"author_seq":232,"given_name":8172,"surname":8173,"affiliation":63,"orcid":63},"Magnus","Sahlgren","This paper is a short empirical study of the performance of centrality and classification based iterative term set expansion methods for distributional semantic models. Iterative term set expansion is an interactive process using distributional semantics models where a user labels terms as belonging to some sought after term set, and a system uses this labeling to supply the user with new, candidate, terms to label, trying to maximize the number of positive examples found. 
While centrality based methods have a long history in term set expansion, we compare them to classification methods based on the the Simple Margin method, an Active Learning approach to classification using Support Vector Machines. Examining the performance of various centrality and classification based methods for a variety of distributional models over five different term sets, we can show that active learning based methods consistently outperform centrality based methods.",{"paper_id":8176,"title":8177,"year":81,"month":855,"day":63,"doi":8178,"resource_url":8179,"first_page":63,"last_page":63,"pdf_url":8180,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8181,"paper_type":860,"authors":8182,"abstract":8189},"lrec2018-main-406","Can Domain Adaptation be Handled as Analogies?","10.63317\u002F2ytrs3rowvno","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-406","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F323.pdf","bel-pocostales-2018-domain",[8183,8186],{"paper_id":8176,"author_seq":247,"given_name":8184,"surname":8185,"affiliation":63,"orcid":63},"Núria","Bel",{"paper_id":8176,"author_seq":232,"given_name":8187,"surname":8188,"affiliation":63,"orcid":63},"Joel","Pocostales","Aspect identification in user generated texts by supervised text classification might suffer degradation in performance when changing to other domains than the one used for training. For referring to aspects such as quality, price or customer services the vocabulary might differ and affect performance. In this paper, we present an experiment to validate a method to handle domain shifts when there is no available labeled data to retrain. The system is based on the offset method as used for solving word analogy problems in vector semantic models such as word embedding. 
Despite of the fact that the offset method indeed found relevant analogues in the new domain for the classifier initial selected features, the classifiers did not deliver the expected results. The analysis showed that a number of words were found as analogues for many different initial features. This phenomenon was already described in the literature as 'default words' or 'hubs'. However, our data showed that it cannot be explained in terms of word frequency or distance to the question word, as suggested.",{"paper_id":8191,"title":8192,"year":81,"month":855,"day":63,"doi":8193,"resource_url":8194,"first_page":63,"last_page":63,"pdf_url":8195,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8196,"paper_type":860,"authors":8197,"abstract":8202},"lrec2018-main-407","Author Profiling from Facebook Corpora","10.63317\u002F3eqq6rr9ddh4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-407","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F354.pdf","hsieh-etal-2018-author",[8198,8200,8201],{"paper_id":8191,"author_seq":247,"given_name":8199,"surname":4686,"affiliation":63,"orcid":63},"Fernando",{"paper_id":8191,"author_seq":232,"given_name":3815,"surname":4267,"affiliation":63,"orcid":63},{"paper_id":8191,"author_seq":218,"given_name":4264,"surname":4265,"affiliation":63,"orcid":63},"Author profiling - the computational task of prediction author's demographics from text - has been a popular research topic in the NLP field, and also the focus of a number of prominent shared tasks. Author profiling is a problem of growing importance, with applications in forensics, security, sales and many others. In recent years, text available from social networks has become a primary source for computational models of author profiling, but existing studies are still largely focused on age and gender prediction, and are in many cases limited to the use of English text. 
Other languages, and other author profiling tasks, remain somewhat less popular. As a means to further  this issue, in this work we present initial results of a number of author profiling tasks from a Facebook corpus in the Brazilian Portuguese language. As in previous studies, our own work will focus on both standard gender and age prediction tasks but, in addition to these, we will also address two less usual author profiling tasks, namely, predicting  an author's degree of religiosity and IT background status. The tasks are modelled by making use of different knowledge sources, and results of alternative approaches are discussed.",{"paper_id":8204,"title":8205,"year":81,"month":855,"day":63,"doi":8206,"resource_url":8207,"first_page":63,"last_page":63,"pdf_url":8208,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8209,"paper_type":860,"authors":8210,"abstract":8231},"lrec2018-main-408","Semantic Relatedness of Wikipedia Concepts – Benchmark Data and a Working Solution","10.63317\u002F4vj8kcyrbv4e","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-408","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F445.pdf","ein-dor-etal-2018-semantic",[8211,8214,8216,8219,8221,8224,8227,8230],{"paper_id":8204,"author_seq":247,"given_name":8212,"surname":8213,"affiliation":63,"orcid":63},"Liat","Ein 
Dor",{"paper_id":8204,"author_seq":232,"given_name":2902,"surname":8215,"affiliation":63,"orcid":63},"Halfon",{"paper_id":8204,"author_seq":218,"given_name":8217,"surname":8218,"affiliation":63,"orcid":63},"Yoav","Kantor",{"paper_id":8204,"author_seq":203,"given_name":8220,"surname":4151,"affiliation":63,"orcid":63},"Ran",{"paper_id":8204,"author_seq":188,"given_name":8222,"surname":8223,"affiliation":63,"orcid":63},"Yosi","Mass",{"paper_id":8204,"author_seq":172,"given_name":8225,"surname":8226,"affiliation":63,"orcid":63},"Ruty","Rinott",{"paper_id":8204,"author_seq":155,"given_name":8228,"surname":8229,"affiliation":63,"orcid":63},"Eyal","Shnarch",{"paper_id":8204,"author_seq":138,"given_name":1626,"surname":1627,"affiliation":63,"orcid":63},"Wikipedia is a very popular source of encyclopedic knowledge which provides highly reliable articles in a variety of domains. This richness and popularity created a strong motivation among NLP researchers to develop relatedness measures between Wikipedia concepts. In this paper, we introduce WORD (Wikipedia Oriented Relatedness Dataset), a new type of concept relatedness dataset, composed of 19,276 pairs of Wikipedia concepts. This is the first human annotated dataset of Wikipedia concepts, whose purpose is twofold. On the one hand, it can serve as a benchmark for evaluating concept-relatedness methods. On the other hand, it can be used as supervised data for developing new models for concept relatedness prediction. Among the advantages of this dataset compared to its term-relatedness counterparts, are its built-in disambiguation solution, and its richness with meaningful multiword terms. Based on this benchmark we develop a new tool, named WORT (Wikipedia Oriented Relatedness Tool), for measuring the level of relatedness between pairs of concepts. 
We show that the relatedness predictions ofWORT outperform state of the art methods.",{"paper_id":8233,"title":8234,"year":81,"month":855,"day":63,"doi":8235,"resource_url":8236,"first_page":63,"last_page":63,"pdf_url":8237,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8238,"paper_type":860,"authors":8239,"abstract":8248},"lrec2018-main-409","Experiments with Convolutional Neural Networks for Multi-Label Authorship Attribution","10.63317\u002F4ikgaxhcvh7s","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-409","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F535.pdf","boumber-etal-2018-experiments",[8240,8243,8245],{"paper_id":8233,"author_seq":247,"given_name":8241,"surname":8242,"affiliation":63,"orcid":63},"Dainis","Boumber",{"paper_id":8233,"author_seq":232,"given_name":8244,"surname":2381,"affiliation":63,"orcid":63},"Yifan",{"paper_id":8233,"author_seq":218,"given_name":8246,"surname":8247,"affiliation":63,"orcid":63},"Arjun","Mukherjee","We explore the use of Convolutional Neural Networks (CNNs) for multi-label Authorship Attribution (AA) problems and propose a CNN specifically designed for such tasks. By averaging the author probability distributions at sentence level for the longer documents and treating smaller documents as sentences, our multi-label design adapts to single-label datasets and various document sizes, retaining the capabilities of a traditional CNN. As a part of this work, we also create and make available to the public a multi-label Authorship Attribution dataset (MLPA-400), consisting of 400 scientific publications by 20 authors from the field of Machine Learning. Proposed Multi-label CNN is evaluated against a large number of algorithms on MLPA-400 and PAN-2012, a traditional single-label AA benchmark dataset. 
Experimental results demonstrate that our method outperforms several state-of-the-art models on the proposed task.",{"paper_id":8250,"title":8251,"year":81,"month":855,"day":63,"doi":8252,"resource_url":8253,"first_page":63,"last_page":63,"pdf_url":8254,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8255,"paper_type":860,"authors":8256,"abstract":8268},"lrec2018-main-410","A Fast and Accurate Vietnamese Word Segmenter","10.63317\u002F32m47vrkp2wj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-410","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F55.pdf","nguyen-etal-2018-fast",[8257,8259,8261,8264,8266],{"paper_id":8250,"author_seq":247,"given_name":8258,"surname":1724,"affiliation":63,"orcid":63},"Dat Quoc",{"paper_id":8250,"author_seq":232,"given_name":8260,"surname":1724,"affiliation":63,"orcid":63},"Dai Quoc",{"paper_id":8250,"author_seq":218,"given_name":8262,"surname":8263,"affiliation":63,"orcid":63},"Thanh","Vu",{"paper_id":8250,"author_seq":203,"given_name":1364,"surname":8265,"affiliation":63,"orcid":63},"Dras",{"paper_id":8250,"author_seq":188,"given_name":1364,"surname":8267,"affiliation":63,"orcid":63},"Johnson","We propose a novel approach to Vietnamese word segmentation. Our approach is based on the Single Classification Ripple Down Rules methodology (Compton and Jansen, 1990), where rules are stored in an exception structure and new rules are only added to correct segmentation errors given by existing rules. Experimental results on the benchmark Vietnamese treebank show that our approach outperforms previous state-of-the-art approaches JVnSegmenter, vnTokenizer, DongDu and UETsegmenter in terms of both accuracy and performance speed. 
Our code is open-source and available at: https:\u002F\u002Fgithub.com\u002Fdatquocnguyen\u002FRDRsegmenter.",{"paper_id":8270,"title":8271,"year":81,"month":855,"day":63,"doi":8272,"resource_url":8273,"first_page":63,"last_page":63,"pdf_url":8274,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8275,"paper_type":860,"authors":8276,"abstract":8289},"lrec2018-main-411","Finite-state morphological analysis for Gagauz","10.63317\u002F25xmsmgzo3pt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-411","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F70.pdf","bayatli-etal-2018-finite",[8277,8280,8283,8286],{"paper_id":8270,"author_seq":247,"given_name":8278,"surname":8279,"affiliation":63,"orcid":63},"Sevilay","Bayatli",{"paper_id":8270,"author_seq":232,"given_name":8281,"surname":8282,"affiliation":63,"orcid":63},"Güllü","Karanfil",{"paper_id":8270,"author_seq":218,"given_name":8284,"surname":8285,"affiliation":63,"orcid":63},"Memduh","Gökırmak",{"paper_id":8270,"author_seq":203,"given_name":8287,"surname":8288,"affiliation":63,"orcid":63},"Francis M.","Tyers","This paper describes a finite-state approach to morphological analysis and generation of Gagauz, a Turkic language spoken in the Republic of Moldova. Finite-state approaches are commonly used in morphological modelling, but one of the novelties of our approach is that we explicitly handle orthographic errors and variance, in addition to loan words. 
The resulting model has a reasonable coverage (above 90\\%) over a range of freely-available corpora.",{"paper_id":8291,"title":8292,"year":81,"month":855,"day":63,"doi":8293,"resource_url":8294,"first_page":63,"last_page":63,"pdf_url":8295,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8296,"paper_type":860,"authors":8297,"abstract":8302},"lrec2018-main-412","Albanian Part-of-Speech Tagging: Gold Standard and Evaluation","10.63317\u002F3s8vx7a3wnzd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-412","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F89.pdf","kabashi-proisl-2018-albanian",[8298,8301],{"paper_id":8291,"author_seq":247,"given_name":8299,"surname":8300,"affiliation":63,"orcid":63},"Besim","Kabashi",{"paper_id":8291,"author_seq":232,"given_name":1615,"surname":2964,"affiliation":63,"orcid":63},"In this paper, we present a gold standard corpus for Albanian part-of-speech tagging and perform evaluation experiments with different statistical taggers. The corpus consists of more than 31,000 tokens and has been manually annotated with a medium-sized tagset that can adequately represent the syntagmatic aspects of the language. We provide mappings from the full tagset to both the original Google Universal Part-of-Speech Tags and the variant used in the Universal Dependencies project. 
We perform experiments with different taggers on the full tagset as well as on the coarser tagsets and achieve accuracies of up to 95.10%.",{"paper_id":8304,"title":8305,"year":81,"month":855,"day":63,"doi":8306,"resource_url":8307,"first_page":63,"last_page":63,"pdf_url":8308,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8309,"paper_type":860,"authors":8310,"abstract":8315},"lrec2018-main-413","Morphology Injection for English-Malayalam Statistical Machine Translation","10.63317\u002F4dbacuiz3vcs","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-413","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F125.pdf","s-bhattacharyya-2018-morphology",[8311,8314],{"paper_id":8304,"author_seq":247,"given_name":8312,"surname":8313,"affiliation":63,"orcid":63},"Sreelekha","S",{"paper_id":8304,"author_seq":232,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},"Statistical Machine Translation (SMT) approaches fails to handle the rich morphology when translating into morphologically rich languages. This is due to the data sparsity, which is the missing of the morphologically inflected forms of words from the parallel corpus. We investigated a method to generate these unseen morphological forms. In this paper, we analyze the morphological complexity of a morphologically rich Indian language Malayalam when translating from English. Being a highly agglutinative language, it is very difficult to generate the various morphological inflected forms for Malayalam. We study both the factor based models and the phrase based models and the problem of data sparseness. We propose a simple and effective solution based on enriching the parallel corpus with generated morphological forms. We verify this approach with various experiments on English-Malayalam SMT. We observes that the morphology injection method improves the quality of the translation. 
We have analyzed the experimental results both in terms of automatic and subjective evaluations.",{"paper_id":8317,"title":8318,"year":81,"month":855,"day":63,"doi":8319,"resource_url":8320,"first_page":63,"last_page":63,"pdf_url":8321,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8322,"paper_type":860,"authors":8323,"abstract":8327},"lrec2018-main-414","The Morpho-syntactic Annotation of Animacy for a Dependency Parser","10.63317\u002F3v6n7weae8eb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-414","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F150.pdf","attia-etal-2018-morpho",[8324,8325,8326],{"paper_id":8317,"author_seq":247,"given_name":1166,"surname":1167,"affiliation":63,"orcid":63},{"paper_id":8317,"author_seq":232,"given_name":4869,"surname":4870,"affiliation":63,"orcid":63},{"paper_id":8317,"author_seq":218,"given_name":2863,"surname":2864,"affiliation":63,"orcid":63},"In this paper we present the annotation scheme and parser results of the animacy feature in Russian and Arabic, two morphologicallyrich languages, in the spirit of the universal dependency framework (McDonald et al., 2013; de Marneffe et al., 2014). We explain the animacy hierarchies in both languages and make the case for the existence of five animacy types. We train a morphological analyzer on the annotated data and the results show a prediction f-measure for animacy of 95.39% for Russian and 92.71% for Arabic. We also use animacy along with other morphological tags as features to train a dependency parser, and the results show a slight improvement gained from animacy. We compare the impact of animacy on improving the dependency parser to other features found in nouns, namely, ‘gender’, ‘number’, and ‘case’. To our knowledge this is the first contrastive study of the impact of morphological features on the accuracy of a transition parser. 
A portion of our data (1,000 sentences for Arabic and Russian each, along with other languages) annotated according to the scheme described in this paper is made publicly available (https:\u002F\u002Flindat.mff.cuni.cz\u002Frepository\u002Fxmlui\u002Fhandle\u002F11234\u002F1-1983) as part of the CoNLL 2017 Shared Task on Multilingual Parsing (Zeman et al., 2017).",{"paper_id":8329,"title":8330,"year":81,"month":855,"day":63,"doi":8331,"resource_url":8332,"first_page":63,"last_page":63,"pdf_url":8333,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8334,"paper_type":860,"authors":8335,"abstract":8350},"lrec2018-main-415","MADARi: A Web Interface for Joint Arabic Morphological Annotation and Spelling Correction","10.63317\u002F5nejcfwotfzc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-415","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F350.pdf","obeid-etal-2018-madari",[8336,8339,8342,8343,8346,8347],{"paper_id":8329,"author_seq":247,"given_name":8337,"surname":8338,"affiliation":63,"orcid":63},"Ossama","Obeid",{"paper_id":8329,"author_seq":232,"given_name":8340,"surname":8341,"affiliation":63,"orcid":63},"Salam","Khalifa",{"paper_id":8329,"author_seq":218,"given_name":3646,"surname":3647,"affiliation":63,"orcid":63},{"paper_id":8329,"author_seq":203,"given_name":8344,"surname":8345,"affiliation":63,"orcid":63},"Houda","Bouamor",{"paper_id":8329,"author_seq":188,"given_name":3042,"surname":3043,"affiliation":63,"orcid":63},{"paper_id":8329,"author_seq":172,"given_name":8348,"surname":8349,"affiliation":63,"orcid":63},"Kemal","Oflazer","In this paper, we introduce MADARi, a joint morphological annotation and spelling correction system for texts in Standard and Dialectal Arabic. The MADARi framework provides intuitive interfaces for annotating text and managing the annotation process of a large number of sizable documents. 
Morphological annotation includes indicating, for a word, in context, its baseword, clitics, part-of-speech, lemma, gloss, and dialect identification. MADARi has a suite of utilities to help with annotator productivity. For example, annotators are provided with pre-computed analyses to assist them in their task and reduce the amount of work needed to complete it. MADARi also allows annotators to query a morphological analyzer for a list of possible analyses in multiple dialects or look up previously submitted analyses. The MADARi management interface enables a lead annotator to easily manage and organize the whole annotation process remotely and concurrently. We describe the motivation, design and implementation of this interface; and we present details from a user study working with this system.",{"paper_id":8352,"title":8353,"year":81,"month":855,"day":63,"doi":8354,"resource_url":8355,"first_page":63,"last_page":63,"pdf_url":8356,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8357,"paper_type":860,"authors":8358,"abstract":8364},"lrec2018-main-416","A Morphological Analyzer for St. Lawrence Island \u002F Central Siberian Yupik","10.63317\u002F5hii23dhauqb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-416","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F358.pdf","chen-schwartz-2018-morphological",[8359,8361],{"paper_id":8352,"author_seq":247,"given_name":8360,"surname":2401,"affiliation":63,"orcid":63},"Emily",{"paper_id":8352,"author_seq":232,"given_name":8362,"surname":8363,"affiliation":63,"orcid":63},"Lane","Schwartz","St. Lawrence Island \u002F Central Siberian Yupik is an endangered language, indigenous to St. Lawrence Island in Alaska and the Chukotka Peninsula of Russia, that exhibits pervasive agglutinative and polysynthetic properties. 
This paper discusses an implementation of a finite-state morphological analyzer for Yupik that was developed in accordance with the grammatical standards and phenomena documented in Jacobson’s 2001 reference grammar for Yupik. The analyzer was written in foma, an open source framework for constructing finite-state grammars of morphology. The approach presented here cyclically interweaves morphology and phonology to account for the language’s intricate morphophonological system, an approach that may be applicable to typologically similar languages. The morphological analyzer has been designed to serve as foundational resource that will eventually underpin a suite of computational tools for Yupik to assist in the process of linguistic documentation and revitalization.",{"paper_id":8366,"title":8367,"year":81,"month":855,"day":63,"doi":8368,"resource_url":8369,"first_page":63,"last_page":63,"pdf_url":8370,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8371,"paper_type":860,"authors":8372,"abstract":8392},"lrec2018-main-417","Universal Morphologies for the Caucasus 
region","10.63317\u002F3opez2w6dc3t","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-417","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F421.pdf","chiarcos-etal-2018-universal",[8373,8374,8377,8379,8382,8385,8388,8390],{"paper_id":8366,"author_seq":247,"given_name":904,"surname":2684,"affiliation":63,"orcid":63},{"paper_id":8366,"author_seq":232,"given_name":8375,"surname":8376,"affiliation":63,"orcid":63},"Kathrin","Donandt",{"paper_id":8366,"author_seq":218,"given_name":4483,"surname":8378,"affiliation":63,"orcid":63},"Ionov",{"paper_id":8366,"author_seq":203,"given_name":8380,"surname":8381,"affiliation":63,"orcid":63},"Monika","Rind-Pawlowski",{"paper_id":8366,"author_seq":188,"given_name":8383,"surname":8384,"affiliation":63,"orcid":63},"Hasmik","Sargsian",{"paper_id":8366,"author_seq":172,"given_name":8386,"surname":8387,"affiliation":63,"orcid":63},"Jesse","Wichers Schreur",{"paper_id":8366,"author_seq":155,"given_name":3326,"surname":8389,"affiliation":63,"orcid":63},"Abromeit",{"paper_id":8366,"author_seq":138,"given_name":904,"surname":8391,"affiliation":63,"orcid":63},"Fäth","The Caucasus region is famed for its rich and diverse arrays of languages and language families, often challenging European-centered views established in traditional linguistics. In this paper, we describe ongoing efforts to improve the coverage of Universal Morphologies for languages of the Caucasus region. The Universal Morphologies (UniMorph) are a recent community project aiming to complement the Universal Dependencies which focus on morphosyntax and syntax. 
We describe the development of UniMorph resources for Nakh-Daghestanian and Kartvelian languages as well as for Classical Armenian, we discuss challenges that the complex morphology of these and related languages poses to the current design of UniMorph, and suggest possibilities to improve the applicability of UniMorph for languages of the Caucasus region in particular and for low resource languages in general. We also criticize the UniMorph TSV format for its limited expressiveness, and suggest to complement the existing UniMorph workflow with support for additional source formats on grounds of Linked Open Data technology.",{"paper_id":8394,"title":8395,"year":81,"month":855,"day":63,"doi":8396,"resource_url":8397,"first_page":63,"last_page":63,"pdf_url":8398,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8399,"paper_type":860,"authors":8400,"abstract":8404},"lrec2018-main-418","EMTC: Multilabel Corpus in Movie Domain for Emotion Analysis in Conversational Text","10.63317\u002F3rbax7z34fmx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-418","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F405.pdf","phan-matsumoto-2018-emtc",[8401,8403],{"paper_id":8394,"author_seq":247,"given_name":8402,"surname":7157,"affiliation":63,"orcid":63},"Duc-Anh",{"paper_id":8394,"author_seq":232,"given_name":3649,"surname":2464,"affiliation":63,"orcid":63},"It is proved that in text-based communication such as sms, messengers applications, misinterpretation of partner’s emotions are pretty common. In order to tackle this problem, we propose a new multilabel corpus named Emotional Movie Transcript Corpus (EMTC). Unlike most of the existing emotion corpora that are collected from Twitters and use hashtags labels, our corpus includes conversations from movie with more than 2.1 millions utterances which are partly annotated by ourselves and independent annotators. 
To our intuition, conversations from movies are closer to real-life settings and emotionally richer. We believe that a corpus like EMTC will greatly benefit the development and evaluation of emotion analysis systems and improve their ability to express and interpret emotions in text-based communication.",{"paper_id":8406,"title":8407,"year":81,"month":855,"day":63,"doi":8408,"resource_url":8409,"first_page":63,"last_page":63,"pdf_url":8410,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8411,"paper_type":860,"authors":8412,"abstract":8419},"lrec2018-main-419","Complex and Precise Movie and Book Annotations in French Language for Aspect Based Sentiment Analysis","10.63317\u002F4krerhuhuw9y","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-419","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F449.pdf","pecore-villaneau-2018-complex",[8413,8416],{"paper_id":8406,"author_seq":247,"given_name":8414,"surname":8415,"affiliation":63,"orcid":63},"Stefania","Pecore",{"paper_id":8406,"author_seq":232,"given_name":8417,"surname":8418,"affiliation":63,"orcid":63},"Jeanne","Villaneau","Aspect Based Sentiment Analysis (ABSA) aims at collecting detailed opinion information according to products and their features, via the recognition of targets of the opinions in text.  Though some annotated data have been produced in challenges as SemEval, resources are still scarce, especially for languages other than English.  We are interested in enhancing today’s mostly statistical text classification with the use of linguistics tools, in order to better define and analyze what has been written. The work presented in this paper focuses on two French datasets of movies and books online reviews. In reviews, text length is much higher compared to a tweet, giving us the opportunity to work on a challenging and linguistically interesting dataset. 
Moreover, movies and books are products that make classifying opinions into aspects quite complex.  This article provides an analysis of the particularities of the two domains during the process of collecting and annotating data, a precise annotation scheme for each domain, examples and statistics issued from the annotation phase, and some perspectives on our future work.",{"paper_id":8421,"title":8422,"year":81,"month":855,"day":63,"doi":8423,"resource_url":8424,"first_page":63,"last_page":63,"pdf_url":8425,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8426,"paper_type":860,"authors":8427,"abstract":8432},"lrec2018-main-420","Lingmotif-lex: a Wide-coverage, State-of-the-art Lexicon for Sentiment Analysis","10.63317\u002F52vz4w6a5vis","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-420","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F457.pdf","moreno-ortiz-perez-hernandez-2018-lingmotif",[8428,8430],{"paper_id":8421,"author_seq":247,"given_name":1753,"surname":8429,"affiliation":63,"orcid":63},"Moreno-Ortiz",{"paper_id":8421,"author_seq":232,"given_name":4164,"surname":8431,"affiliation":63,"orcid":63},"Pérez-Hernández","We present Lingmotif-lex, a new, wide-coverage, domain-neutral lexicon for sentiment analysis in English. We describe the creation process of this resource, its assumptions, format, and valence system. Unlike most sentiment lexicons currently available, Lingmotif-lex places strong emphasis on multi-word expressions, and has been manually curated to be as accurate, unambiguous, and comprehensive as possible. Also unlike existing available resources, \\textit{Lingmotif-lex} comprises a comprehensive set of contextual valence shifters (CVS) that account for valence modification by context. 
Formal evaluation is provided by testing it on two publicly available sentiment analysis datasets, and comparing it with other English sentiment lexicons available, which we adapted to make this comparison as fair as possible. We show how Lingmotif-lex achieves significantly better performance than these lexicons across both datasets.",{"paper_id":8434,"title":8435,"year":81,"month":855,"day":63,"doi":8436,"resource_url":8437,"first_page":63,"last_page":63,"pdf_url":8438,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8439,"paper_type":860,"authors":8440,"abstract":8446},"lrec2018-main-421","A Japanese Corpus for Analyzing Customer Loyalty Information","10.63317\u002F4mpwdq7y5f68","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-421","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F465.pdf","wang-tahara-2018-japanese",[8441,8443],{"paper_id":8434,"author_seq":247,"given_name":8442,"surname":1588,"affiliation":63,"orcid":63},"Yiou",{"paper_id":8434,"author_seq":232,"given_name":8444,"surname":8445,"affiliation":63,"orcid":63},"Takuji","Tahara","Today customers voice attitudes, opinions and their experience about some brands, companies, products or services through center calls, web reviews or SNS and analyzing them is an important task. On the other hand, customer loyalty has long been a topic of high interest in both academia and industry. Therefore, it is attractive to consider exploiting customer loyalty information by analyzing the voice of customer. However, although many previous studies focused on analyzing attitudes, opinions, sentiments of the text data, no work has been conducted from the perspective of customer loyalty, which is reflected by a combination of customer attitudes and behavior. In this work, we present JCLIC, Japanese Customer Loyalty Information Corpus, which is a corpus for analyzing customer loyalty information. 
For each review we have annotated detailed customer loyalty information which contains: loyalty degree that reflects loyalty level of the customer, loyalty expression that expresses the customer loyalty, loyalty type that indicates the category to which loyalty expression belongs, reason expression that expresses why the customer have such loyalty degree, and reason type that indicates the category to which reason expression belongs. We describe our annotation scheme and annotation process, present results of an agreement study and give some statistics about the corpus we have annotated.",{"paper_id":8448,"title":8449,"year":81,"month":855,"day":63,"doi":8450,"resource_url":8451,"first_page":63,"last_page":63,"pdf_url":8452,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8453,"paper_type":860,"authors":8454,"abstract":8466},"lrec2018-main-422","FooTweets: A Bilingual Parallel Corpus of World Cup Tweets","10.63317\u002F3p9mvhv66xai","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-422","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F471.pdf","sluyter-gathje-etal-2018-footweets",[8455,8458,8461,8464],{"paper_id":8448,"author_seq":247,"given_name":8456,"surname":8457,"affiliation":63,"orcid":63},"Henny","Sluyter-Gäthje",{"paper_id":8448,"author_seq":232,"given_name":8459,"surname":8460,"affiliation":63,"orcid":63},"Pintu","Lohar",{"paper_id":8448,"author_seq":218,"given_name":8462,"surname":8463,"affiliation":63,"orcid":63},"Haithem","Afli",{"paper_id":8448,"author_seq":203,"given_name":6545,"surname":8465,"affiliation":63,"orcid":63},"Way","The way information spreads through society has changed significantly over the past decade with the advent of online social networking. Twitter, one of the most widely used social networking websites, is known as the real-time, public microblogging network where news breaks first. 
Most users love it for its iconic 140-character limitation and unfiltered feed that show them news and opinions in the form of tweets. Tweets are usually multilingual in nature and of varying quality. However, machine translation (MT) of twitter data is a challenging task especially due to the following two reasons: (i) tweets are informal in nature (i.e., violates linguistic norms), and (ii) parallel resource for twitter data is scarcely available on the Internet. In this paper, we develop FooTweets, a first parallel corpus of tweets for English–German language pair. We extract 4,000 English tweets from the FIFA 2014 world cup and manually translate them into German with a special focus on the informal nature of the tweets. In addition to this, we also annotate sentiment scores between 0 and 1 to all the tweets depending upon the degree of sentiment associated with them. This data has recently been used to build sentiment translation engines and an extensive evaluation revealed that such a resource is very useful in machine translation of user generated content.",{"paper_id":8468,"title":8469,"year":81,"month":855,"day":63,"doi":8470,"resource_url":8471,"first_page":63,"last_page":63,"pdf_url":8472,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8473,"paper_type":860,"authors":8474,"abstract":8482},"lrec2018-main-423","The SSIX Corpora: Three Gold Standard Corpora for Sentiment Analysis in English, Spanish and German Financial 
Microblogs","10.63317\u002F5j87b5pksr6f","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-423","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F564.pdf","gaillat-etal-2018-ssix",[8475,8477,8480,8481],{"paper_id":8468,"author_seq":247,"given_name":1615,"surname":8476,"affiliation":63,"orcid":63},"Gaillat",{"paper_id":8468,"author_seq":232,"given_name":8478,"surname":8479,"affiliation":63,"orcid":63},"Manel","Zarrouk",{"paper_id":8468,"author_seq":218,"given_name":4788,"surname":4789,"affiliation":63,"orcid":63},{"paper_id":8468,"author_seq":203,"given_name":4785,"surname":4786,"affiliation":63,"orcid":63},"This paper introduces the three SSIX corpora for sentiment analysis. These corpora address the need to provide annotated data for supervised learning methods. They focus on stock-market related messages extracted from two financial microblog platforms, i.e., StockTwits and Twitter. In total they include 2,886 messages with opinion targets. These messages are provided with polarity annotation set on a continuous scale by three or four experts in each language. The annotation information identifies the targets with a sentiment score. The annotation process includes manual annotation verified and consolidated by financial experts. 
The creation of the annotated corpora took into account principled sampling strategies as well as inter-annotator agreement before consolidation in order to maximize data quality.",{"paper_id":8484,"title":8485,"year":81,"month":855,"day":63,"doi":8486,"resource_url":8487,"first_page":63,"last_page":63,"pdf_url":8488,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8489,"paper_type":860,"authors":8490,"abstract":8500},"lrec2018-main-424","Sarcasm Target Identification: Dataset and An Introductory Approach","10.63317\u002F2xck5kccxygt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-424","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F583.pdf","joshi-etal-2018-sarcasm",[8491,8494,8497,8498],{"paper_id":8484,"author_seq":247,"given_name":8492,"surname":8493,"affiliation":63,"orcid":63},"Aditya","Joshi",{"paper_id":8484,"author_seq":232,"given_name":8495,"surname":8496,"affiliation":63,"orcid":63},"Pranav","Goel",{"paper_id":8484,"author_seq":218,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},{"paper_id":8484,"author_seq":203,"given_name":1364,"surname":8499,"affiliation":63,"orcid":63},"Carman","Past work in computational sarcasm deals primarily with sarcasm detection. In this paper, we introduce a novel, related problem: sarcasm target identification (i.e., extracting the target of ridicule in a sarcastic sentence). As a benchmark, we introduce a new dataset for the task. This dataset is manually annotated for the sarcasm target in book snippets and tweets based on our formulation of the task. We then introduce an automatic approach for sarcasm target identification. It is based on a combination of two types of extractors: one based on rules, and another consisting of a statistical classifier.  
Our introductory approach establishes the viability of sarcasm target identification, and will serve as a baseline for future work.",{"paper_id":8502,"title":8503,"year":81,"month":855,"day":63,"doi":8504,"resource_url":8505,"first_page":63,"last_page":63,"pdf_url":8506,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8507,"paper_type":860,"authors":8508,"abstract":8522},"lrec2018-main-425","Annotating Opinions and Opinion Targets in Student Course Feedback","10.63317\u002F5ko6tp2e46eg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-425","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F726.pdf","chathuranga-etal-2018-annotating",[8509,8512,8515,8518,8521],{"paper_id":8502,"author_seq":247,"given_name":8510,"surname":8511,"affiliation":63,"orcid":63},"Janaka","Chathuranga",{"paper_id":8502,"author_seq":232,"given_name":8513,"surname":8514,"affiliation":63,"orcid":63},"Shanika","Ediriweera",{"paper_id":8502,"author_seq":218,"given_name":8516,"surname":8517,"affiliation":63,"orcid":63},"Ravindu","Hasantha",{"paper_id":8502,"author_seq":203,"given_name":8519,"surname":8520,"affiliation":63,"orcid":63},"Pranidhith","Munasinghe",{"paper_id":8502,"author_seq":188,"given_name":5697,"surname":5698,"affiliation":63,"orcid":63},"In this paper, we present a student course feedback corpus, a novel resource for opinion target extraction and sentiment analysis. The corpus is developed with the main aim of summarizing general feedback given by students on undergraduate-level courses. In this corpus, opinion targets, opinion expressions, and polarities of the opinion expressions towards the opinion targets are annotated. Opinion targets are basically the important key points in feedback that the students have shown their sentiment towards, such as “Lecture Slides”, and “Teaching Style”. 
The uniqueness of the corpus, annotation methodology, difficulties faced during annotating, and possible usages of the corpus are discussed in this paper.",{"paper_id":8524,"title":8525,"year":81,"month":855,"day":63,"doi":8526,"resource_url":8527,"first_page":63,"last_page":63,"pdf_url":8528,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8529,"paper_type":860,"authors":8530,"abstract":8543},"lrec2018-main-426","Generating a Gold Standard for a Swedish Sentiment Lexicon","10.63317\u002F439usxg8i7vq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-426","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F846.pdf","rouces-etal-2018-generating",[8531,8534,8537,8540],{"paper_id":8524,"author_seq":247,"given_name":8532,"surname":8533,"affiliation":63,"orcid":63},"Jacobo","Rouces",{"paper_id":8524,"author_seq":232,"given_name":8535,"surname":8536,"affiliation":63,"orcid":63},"Nina","Tahmasebi",{"paper_id":8524,"author_seq":218,"given_name":8538,"surname":8539,"affiliation":63,"orcid":63},"Lars","Borin",{"paper_id":8524,"author_seq":203,"given_name":8541,"surname":8542,"affiliation":63,"orcid":63},"Stian","Rødven Eide","There is an increasing demand for multilingual sentiment analysis, and most work on sentiment lexicons is still carried out based on English lexicons like WordNet. In addition, many of the non-English sentiment lexicons that do exist have been compiled by (machine) translation from English resources, thereby arguably obscuring possible language-specific characteristics of sentiment-loaded vocabulary. In this paper we describe the creation of a gold standard for the sentiment annotation of Swedish terms as a first step towards the creation of a full-fledged sentiment lexicon for Swedish -- i.e., a lexicon containing information about \\emph{prior} sentiment (also called polarity) values of lexical items (words or disambiguated word senses), along a scale negative--positive. 
We create a gold standard for sentiment annotation of Swedish terms, using the freely available SALDO lexicon and the Gigaword corpus. For this purpose, we employ a multi-stage approach combining corpus-based frequency sampling and two stages of human annotation: direct score annotation followed by Best-Worst Scaling. In addition to obtaining a gold standard, we analyze the data from our process and we draw conclusions about the optimal sentiment model.",{"paper_id":8545,"title":8546,"year":81,"month":855,"day":63,"doi":8547,"resource_url":8548,"first_page":63,"last_page":63,"pdf_url":8549,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8550,"paper_type":860,"authors":8551,"abstract":8560},"lrec2018-main-427","WordKit: a Python Package for Orthographic and Phonological Featurization","10.63317\u002F5jvnnyc7kptq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-427","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F249.pdf","tulkens-etal-2018-wordkit",[8552,8555,8557],{"paper_id":8545,"author_seq":247,"given_name":8553,"surname":8554,"affiliation":63,"orcid":63},"Stéphan","Tulkens",{"paper_id":8545,"author_seq":232,"given_name":8556,"surname":6270,"affiliation":63,"orcid":63},"Dominiek",{"paper_id":8545,"author_seq":218,"given_name":8558,"surname":8559,"affiliation":63,"orcid":63},"Walter","Daelemans","The modeling of psycholinguistic phenomena, such as word reading, with machine learning techniques requires the featurization of word stimuli into appropriate orthographic and phonological representations. Critically, the choice of features impacts the performance of machine learning algorithms, and can have important ramifications for the conclusions drawn from a model. As such, featurizing words with a variety of feature sets, without having to resort to using different tools is beneficial development. 
In this work, we present wordkit, a python package which allows users to switch between feature sets and featurizers with a uniform API, allowing for rapid prototyping. To the best of our knowledge, this is the first package which integrates a variety of orthographic and phonological featurizers in a single package. The package is fully compatible with scikit-learn, and hence can be integrated into other pipelines. Furthermore, the package is modular and extensible, allowing for the integration of a large variety of feature sets and featurizers. The package and documentation can be found at github.com\u002Fstephantul\u002Fwordkit",{"paper_id":8562,"title":8563,"year":81,"month":855,"day":63,"doi":8564,"resource_url":8565,"first_page":63,"last_page":63,"pdf_url":8566,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8567,"paper_type":860,"authors":8568,"abstract":8578},"lrec2018-main-428","Pronunciation Variants and ASR of Colloquial Speech: A Case Study on Czech","10.63317\u002F5e4zmxtg7k57","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-428","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F833.pdf","lukes-etal-2018-pronunciation",[8569,8571,8573,8576],{"paper_id":8562,"author_seq":247,"given_name":1199,"surname":8570,"affiliation":63,"orcid":63},"Lukeš",{"paper_id":8562,"author_seq":232,"given_name":4656,"surname":8572,"affiliation":63,"orcid":63},"Kopřivová",{"paper_id":8562,"author_seq":218,"given_name":8574,"surname":8575,"affiliation":63,"orcid":63},"Zuzana","Komrsková",{"paper_id":8562,"author_seq":203,"given_name":7585,"surname":8577,"affiliation":63,"orcid":63},"Poukarová","A standard ASR system is built using three types of mutually related language resources: apart from speech recordings and orthographic transcripts, a pronunciation component maps tokens in the transcripts to their phonetic representations. 
Its implementation is either lexicon-based (whether by way of simple lookup or of a stochastic grapheme-to-phoneme converter trained on the source lexicon) or rule-based, or a hybrid thereof. Whichever approach ends up being taken (as determined primarily by the writing system of the language in question), little attention is usually paid to pronunciation variants stemming from connected speech processes, hypoarticulation, and other phenomena typical for colloquial speech, mostly because the resource is seldom directly empirically derived. This paper presents a case study on the automatic recognition of colloquial Czech, using a pronunciation dictionary extracted from the ORTOFON corpus of informal spontaneous Czech, which is manually phonetically transcribed. The performance of the dictionary is compared to a standard rule-based pronunciation component, as evaluated against a subset of the ORTOFON corpus (multiple speakers recorded on a single compact device) and the Vystadial telephone speech corpus, for which prior benchmarks are available.",{"paper_id":8580,"title":8581,"year":81,"month":855,"day":63,"doi":8582,"resource_url":8583,"first_page":63,"last_page":63,"pdf_url":8584,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8585,"paper_type":860,"authors":8586,"abstract":8595},"lrec2018-main-429","Epitran: Precision G2P for Many Languages","10.63317\u002F57n9v6em7ihu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-429","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F890.pdf","mortensen-etal-2018-epitran",[8587,8590,8593],{"paper_id":8580,"author_seq":247,"given_name":8588,"surname":8589,"affiliation":63,"orcid":63},"David R.","Mortensen",{"paper_id":8580,"author_seq":232,"given_name":8591,"surname":8592,"affiliation":63,"orcid":63},"Siddharth","Dalmia",{"paper_id":8580,"author_seq":218,"given_name":2451,"surname":8594,"affiliation":63,"orcid":63},"Littell","Epitran is a massively multilingual, 
multiple back-end system for G2P (grapheme-to-phoneme) transduction which is distributed with support for 61 languages. It takes word tokens in the orthography of a language and outputs a phonemic representation in either IPA or X-SAMPA. The main system is written in Python and is publicly available as open source software. Its efficacy has been demonstrated in multiple research projects relating to language transfer, polyglot models, and speech. In a particular ASR task, Epitran was shown to improve the word error rate over Babel baselines for acoustic modeling.",{"paper_id":8597,"title":8598,"year":81,"month":855,"day":63,"doi":8599,"resource_url":8600,"first_page":63,"last_page":63,"pdf_url":8601,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8602,"paper_type":860,"authors":8603,"abstract":8618},"lrec2018-main-430","A Multilingual Approach to Question Classification","10.63317\u002F339mh28dgi2e","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-430","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F13.pdf","kalouli-etal-2018-multilingual",[8604,8607,8610,8613,8615],{"paper_id":8597,"author_seq":247,"given_name":8605,"surname":8606,"affiliation":63,"orcid":63},"Aikaterini-Lida","Kalouli",{"paper_id":8597,"author_seq":232,"given_name":8608,"surname":8609,"affiliation":63,"orcid":63},"Katharina","Kaiser",{"paper_id":8597,"author_seq":218,"given_name":8611,"surname":8612,"affiliation":63,"orcid":63},"Annette","Hautli-Janisz",{"paper_id":8597,"author_seq":203,"given_name":8614,"surname":8609,"affiliation":63,"orcid":63},"Georg A.",{"paper_id":8597,"author_seq":188,"given_name":8616,"surname":8617,"affiliation":63,"orcid":63},"Miriam","Butt","In this paper we present the Konstanz Resource of Questions (KRoQ), the first dependency-parsed, parallel multilingual corpus of information-seeking and non-information-seeking questions.  
In creating the corpus, we employ a linguistically motivated rule-based system that uses linguistic cues from one language to help classify and annotate questions across other languages. Our current corpus includes German, French, Spanish and Koine Greek. Based on the linguistically motivated heuristics we identify, a two-step scoring mechanism assigns intra- and inter-language scores to each question. Based on these scores, each question is classified as being either information seeking or non-information seeking. An evaluation shows that this mechanism correctly classifies questions in 79% of the cases. We release our corpus as a basis for further work in the area of question classification. It can be utilized as training and testing data for machine-learning algorithms, as corpus-data for theoretical linguistic questions or as a resource for further rule-based approaches to question identification.",{"paper_id":8620,"title":8621,"year":81,"month":855,"day":63,"doi":8622,"resource_url":8623,"first_page":63,"last_page":63,"pdf_url":8624,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8625,"paper_type":860,"authors":8626,"abstract":8638},"lrec2018-main-431","Dataset for the First Evaluation on Chinese Machine Reading 
Comprehension","10.63317\u002F3carcrr3s25v","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-431","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F32.pdf","cui-etal-2018-dataset",[8627,8629,8630,8632,8634,8636],{"paper_id":8620,"author_seq":247,"given_name":8628,"surname":2427,"affiliation":63,"orcid":63},"Yiming",{"paper_id":8620,"author_seq":232,"given_name":6957,"surname":5232,"affiliation":63,"orcid":63},{"paper_id":8620,"author_seq":218,"given_name":8631,"surname":2401,"affiliation":63,"orcid":63},"Zhipeng",{"paper_id":8620,"author_seq":203,"given_name":8633,"surname":3400,"affiliation":63,"orcid":63},"Wentao",{"paper_id":8620,"author_seq":188,"given_name":8635,"surname":1588,"affiliation":63,"orcid":63},"Shijin",{"paper_id":8620,"author_seq":172,"given_name":8637,"surname":3777,"affiliation":63,"orcid":63},"Guoping","Machine Reading Comprehension (MRC) has become enormously popular recently and has attracted a lot of attention. However, existing reading comprehension datasets are mostly in English. To add diversity in reading comprehension datasets, in this paper we propose a new Chinese reading comprehension dataset for accelerating related research in the community. The proposed dataset contains two different types: cloze-style reading comprehension and user query reading comprehension, associated with large-scale training data as well as human-annotated validation and hidden test set. 
Along with this dataset, we also hosted the first Evaluation on Chinese Machine Reading Comprehension (CMRC-2017) and successfully attracted tens of participants, which suggest the potential impact of this dataset.",{"paper_id":8640,"title":8641,"year":81,"month":855,"day":63,"doi":8642,"resource_url":8643,"first_page":63,"last_page":63,"pdf_url":8644,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8645,"paper_type":860,"authors":8646,"abstract":8652},"lrec2018-main-432","A Multi-Domain Framework for Textual Similarity. A Case Study on Question-to-Question and Question-Answering Similarity Tasks","10.63317\u002F2kqgrvgve5kb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-432","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F34.pdf","hazem-etal-2018-multi",[8647,8648,8651],{"paper_id":8640,"author_seq":247,"given_name":1797,"surname":1798,"affiliation":63,"orcid":63},{"paper_id":8640,"author_seq":232,"given_name":8649,"surname":8650,"affiliation":63,"orcid":63},"Basma","El Amal Boussaha",{"paper_id":8640,"author_seq":218,"given_name":1641,"surname":6897,"affiliation":63,"orcid":63},"Community Question Answering (CQA) websites have become a very popular and useful source of information, which helps users to find out answers to their corresponding questions. On one hand, if a user's question does not exist in the forum, a new post is created so that other users can contribute and provide answers or comments. On the other hand, if similar or related questions already exist in the forum, the system should be able to detect them  and redirect the user towards the corresponding threads. This procedure of detecting similar questions is also known as question-to-question similarity task in the NLP research community. 
Once the correct posts have been detected, it is important to provide the correct answer since some posts can contain tens or hundreds of answers\u002Fcomments which make the user's research more difficult. This procedure is also known as the question-answering similarity task. In this paper, we address both tasks and aim at providing the first framework on the evaluation of  similar questions and question-answering detection on a multi-domain corpora. For that purpose, we use the community question answering forum Stack-Exchange to extract posts and pairs of questions and answers from multiple domains.  We evaluate two baseline approaches over 19 domains and provide preliminary results on multiple annotated question-answering datasets to deal with question-answering similarity task.",{"paper_id":8654,"title":8655,"year":81,"month":855,"day":63,"doi":8656,"resource_url":8657,"first_page":63,"last_page":63,"pdf_url":8658,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8659,"paper_type":860,"authors":8660,"abstract":8671},"lrec2018-main-433","WorldTree: A Corpus of Explanation Graphs for Elementary Science Questions supporting Multi-hop Inference","10.63317\u002F28283yn8kzpn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-433","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F81.pdf","jansen-etal-2018-worldtree",[8661,8663,8666,8668],{"paper_id":8654,"author_seq":247,"given_name":1474,"surname":8662,"affiliation":63,"orcid":63},"Jansen",{"paper_id":8654,"author_seq":232,"given_name":8664,"surname":8665,"affiliation":63,"orcid":63},"Elizabeth","Wainwright",{"paper_id":8654,"author_seq":218,"given_name":1085,"surname":8667,"affiliation":63,"orcid":63},"Marmorstein",{"paper_id":8654,"author_seq":203,"given_name":8669,"surname":8670,"affiliation":63,"orcid":63},"Clayton","Morrison","Developing methods of automated inference that are able to provide users with compelling human-readable justifications for 
why the answer to a question is correct is critical for domains such as science and medicine, where user trust and detecting costly errors are limiting factors to adoption. One of the central barriers to training question answering models on explainable inference tasks is the lack of gold explanations to serve as training data. In this paper we present a corpus of explanations for standardized science exams, a recent challenge task for question answering. We manually construct a corpus of detailed explanations for nearly all publicly available standardized elementary science question (approximately 1,680 3rd through 5th grade questions) and represent these as “explanation graphs” – sets of lexically overlapping sentences that describe how to arrive at the correct answer to a question through a combination of domain and world knowledge. We also provide an explanation-centered tablestore, a collection of semi-structured tables that contain the knowledge to construct these elementary science explanations. 
Together, these two knowledge resources map out a substantial portion of the knowledge required for answering and explaining elementary science exams, and provide both structured and free-text training data for the explainable inference task.",{"paper_id":8673,"title":8674,"year":81,"month":855,"day":63,"doi":8675,"resource_url":8676,"first_page":63,"last_page":63,"pdf_url":8677,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8678,"paper_type":860,"authors":8679,"abstract":8694},"lrec2018-main-434","Analysis of Implicit Conditions in Database Search Dialogues","10.63317\u002F3ax3wut72unq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-434","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F345.pdf","fukunaga-etal-2018-analysis",[8680,8683,8685,8688,8691],{"paper_id":8673,"author_seq":247,"given_name":8681,"surname":8682,"affiliation":63,"orcid":63},"Shun-ya","Fukunaga",{"paper_id":8673,"author_seq":232,"given_name":7690,"surname":8684,"affiliation":63,"orcid":63},"Nishikawa",{"paper_id":8673,"author_seq":218,"given_name":8686,"surname":8687,"affiliation":63,"orcid":63},"Takenobu","Tokunaga",{"paper_id":8673,"author_seq":203,"given_name":8689,"surname":8690,"affiliation":63,"orcid":63},"Hikaru","Yokono",{"paper_id":8673,"author_seq":188,"given_name":8692,"surname":8693,"affiliation":63,"orcid":63},"Tetsuro","Takahashi","This paper reports an annotation to a corpus of database search dialogues on real estate, and the analysis on implicit information in the utterances for constructing database queries. Two annotators annotated 50 dialogues with a set of database field tags, resulting in a high inter-annotator agreement (Cohen's kappa=0.79), and the analysis revealed that 10% of the utterances included non-database-field information. 
We further investigated these utterances to find that more than 93% of them included useful information for figuring out search conditions, which we call the implicit condition. The contribution of this paper is to present the existence and importance of the implicit conditions in the database search dialogues and both qualitative and quantitative analysis of them. Our corpus can provide a fundamental language resource for constructing a dialogue system which can utilise the implicit conditions. The paper concluded with possible approaches to achieve our long-term goal, extracting the implicit conditions in the database search dialogues and utilising them to construct queries.",{"paper_id":8696,"title":8697,"year":81,"month":855,"day":63,"doi":8698,"resource_url":8699,"first_page":63,"last_page":63,"pdf_url":8700,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8701,"paper_type":860,"authors":8702,"abstract":8715},"lrec2018-main-435","An Information-Providing Closed-Domain Human-Agent Interaction Corpus","10.63317\u002F2tvu2ux22kyh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-435","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F429.pdf","van-waterschoot-etal-2018-information",[8703,8706,8708,8710,8713],{"paper_id":8696,"author_seq":247,"given_name":8704,"surname":8705,"affiliation":63,"orcid":63},"Jelte","van Waterschoot",{"paper_id":8696,"author_seq":232,"given_name":2218,"surname":8707,"affiliation":63,"orcid":63},"Dubuisson Duplessis",{"paper_id":8696,"author_seq":218,"given_name":2772,"surname":8709,"affiliation":63,"orcid":63},"Gatti",{"paper_id":8696,"author_seq":203,"given_name":8711,"surname":8712,"affiliation":63,"orcid":63},"Merijn","Bruijnes",{"paper_id":8696,"author_seq":188,"given_name":1924,"surname":8714,"affiliation":63,"orcid":63},"Heylen","The contribution of this paper is twofold: 1) we provide a public corpus for Human-Agent Interaction (where the agent is controlled by a 
Wizard of Oz) and 2) we show a study on verbal alignment in Human-Agent Interaction, to exemplify the corpus' use. In our recordings for the Human-Agent Interaction Alice-corpus (HAI Alice-corpus), participants talked to a wizarded agent, who provided them with information about the book Alice in Wonderland and its author. The wizard had immediate and almost full control over the agent's verbal and nonverbal behavior, as the wizard provided the agent's speech through his own voice and his facial expressions were directly copied onto the agent. The agent's hand gestures were controlled through a button interface. Data was collected to create a corpus with unexpected situations, such as misunderstandings, (accidental) false information, and interruptions. The HAI Alice-corpus consists of transcribed audio-video recordings of 15 conversations (more than 900 utterances) between users and the wizarded agent. As a use-case example, we measured the verbal alignment between the user and the agent. 
The paper contains information about the setup of the data collection, the unexpected situations and a description of our verbal alignment study.",{"paper_id":8717,"title":8718,"year":81,"month":855,"day":63,"doi":8719,"resource_url":8720,"first_page":63,"last_page":63,"pdf_url":8721,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8722,"paper_type":860,"authors":8723,"abstract":8730},"lrec2018-main-436","Augmenting Image Question Answering Dataset by Exploiting Image Captions","10.63317\u002F39gpdtauvb6o","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-436","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F480.pdf","yokota-nakayama-2018-augmenting",[8724,8727],{"paper_id":8717,"author_seq":247,"given_name":8725,"surname":8726,"affiliation":63,"orcid":63},"Masashi","Yokota",{"paper_id":8717,"author_seq":232,"given_name":8728,"surname":8729,"affiliation":63,"orcid":63},"Hideki","Nakayama","Image question answering (IQA) is one of the tasks that need rich resources, i.e. supervised data, to achieve optimal performance. However, because IQA is a challenging task that handles complex input and output information, the cost of naive manual annotation can be prohibitively expensive. On the other hand, it is thought to be relatively easy to obtain relevant pairs of an image and text in an unsupervised manner (e.g., crawling Web data). Based on this expectation, we propose a framework to augment training data for IQA by generating additional examples from unannotated pairs of an image and captions. The important constraint that a generated IQA example must satisfy is that its answer must be inferable from the corresponding image and question. To satisfy this, we first select a possible answer for a given image by randomly extracting an answer from corresponding captions. Then we generate the question from the triplets of the image, captions and fixed answer. 
In experiments, we test our method on the Visual Genome dataset varying the ratio of seed supervised data and demonstrate its effectiveness.",{"paper_id":8732,"title":8733,"year":81,"month":855,"day":63,"doi":8734,"resource_url":8735,"first_page":63,"last_page":63,"pdf_url":8736,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8737,"paper_type":860,"authors":8738,"abstract":8747},"lrec2018-main-437","Semi-supervised Training Data Generation for Multilingual Question Answering","10.63317\u002F3x9aotr9z8ks","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-437","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F711.pdf","lee-etal-2018-semi",[8739,8741,8744,8746],{"paper_id":8732,"author_seq":247,"given_name":8740,"surname":4032,"affiliation":63,"orcid":63},"Kyungjae",{"paper_id":8732,"author_seq":232,"given_name":8742,"surname":8743,"affiliation":63,"orcid":63},"Kyoungho","Yoon",{"paper_id":8732,"author_seq":218,"given_name":8745,"surname":5615,"affiliation":63,"orcid":63},"Sunghyun",{"paper_id":8732,"author_seq":203,"given_name":6678,"surname":6679,"affiliation":63,"orcid":63},"Recently, various datasets for question answering (QA) research have been released, such as SQuAD, Marco, WikiQA, MCTest, and SearchQA. However, such existing training resources for these task mostly support only English. In contrast, we study semi-automated creation of the Korean Question Answering Dataset (K-QuAD), by using automatically translated SQuAD and a QA system bootstrapped on a small QA pair set. As a naive approach for other language, using only machine-translated SQuAD shows limited performance due to translation errors. We study why such approach fails and motivate needs to build seed resources to enable leveraging such resources. Specifically, we annotate seed QA pairs of small size (4K) for Korean language, and design how such seed can be combined with translated English resources. 
These approach, by combining two resources, leads to 71.50 F1 on Korean QA (comparable to 77.3 F1 on SQuAD).",{"paper_id":8749,"title":8750,"year":81,"month":855,"day":63,"doi":8751,"resource_url":8752,"first_page":63,"last_page":63,"pdf_url":8753,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8754,"paper_type":860,"authors":8755,"abstract":8771},"lrec2018-main-438","PhotoshopQuiA: A Corpus of Non-Factoid Questions and Answers for Why-Question Answering","10.63317\u002F2qxky84mur43","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-438","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F758.pdf","dulceanu-etal-2018-photoshopquia",[8756,8758,8761,8762,8765,8767,8769],{"paper_id":8749,"author_seq":247,"given_name":2907,"surname":8757,"affiliation":63,"orcid":63},"Dulceanu",{"paper_id":8749,"author_seq":232,"given_name":8759,"surname":8760,"affiliation":63,"orcid":63},"Thang","Le Dinh",{"paper_id":8749,"author_seq":218,"given_name":8558,"surname":2069,"affiliation":63,"orcid":63},{"paper_id":8749,"author_seq":203,"given_name":8763,"surname":8764,"affiliation":63,"orcid":63},"Trung","Bui",{"paper_id":8749,"author_seq":188,"given_name":8766,"surname":1104,"affiliation":63,"orcid":63},"Doo Soon",{"paper_id":8749,"author_seq":172,"given_name":8768,"surname":8263,"affiliation":63,"orcid":63},"Manh Chien",{"paper_id":8749,"author_seq":155,"given_name":8770,"surname":1104,"affiliation":63,"orcid":63},"Seokhwan","Recent years have witnessed a high interest in non-factoid question answering using Community Question Answering (CQA) web sites. Despite ongoing research using state-of-the-art methods, there is a scarcity of available datasets for this task. Why-questions, which play an important role in open-domain and domain-specific applications, are difficult to answer automatically since the answers need to be constructed based on different information extracted from multiple knowledge sources. 
We introduce the PhotoshopQuiA dataset, a new publicly available set of 2,854 why-question and answer(s) (WhyQ, A) pairs related to Adobe Photoshop usage collected from five CQA web sites. We chose Adobe Photoshop because it is a popular and well-known product, with a lively, knowledgeable and sizeable community. To the best of our knowledge, this is the first English dataset for Why-QA that focuses on a product, as opposed to previous open-domain datasets. The corpus is stored in JSON format and contains detailed data about questions and questioners as well as answers and answerers. The dataset can be used to build Why-QA systems, to evaluate current approaches for answering why-questions, and to develop new models for future QA systems research.",{"paper_id":8773,"title":8774,"year":81,"month":855,"day":63,"doi":8775,"resource_url":8776,"first_page":63,"last_page":63,"pdf_url":8777,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8778,"paper_type":860,"authors":8779,"abstract":8786},"lrec2018-main-439","BioRead: A New Dataset for Biomedical Reading Comprehension","10.63317\u002F2k6jps9hh3ui","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-439","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F795.pdf","pappas-etal-2018-bioread",[8780,8782,8784],{"paper_id":8773,"author_seq":247,"given_name":3204,"surname":8781,"affiliation":63,"orcid":63},"Pappas",{"paper_id":8773,"author_seq":232,"given_name":5428,"surname":8783,"affiliation":63,"orcid":63},"Androutsopoulos",{"paper_id":8773,"author_seq":218,"given_name":7754,"surname":8785,"affiliation":63,"orcid":63},"Papageorgiou","We present BioRead, a new publicly available cloze-style biomedical machine reading comprehension (MRC) dataset with approximately 16.4 million passage-question instances. 
BioRead was constructed in the same way as the widely used Children’s Book Test and its extension BookTest, but using biomedical journal articles and employing MetaMap to identify UMLS concepts. BioRead is one of the largest MRC datasets, and currently the largest one in the biomedical domain. We also provide a subset of BioRead, BioReadLite, for research groups with fewer computational resources. We re-implemented and tested on BioReadLite two well-known MRC methods, AS Reader and AOA Reader, along with four baselines, as a first step towards a BioRead (and BioReadLite) leaderboard. AOA Reader is currently the best method on BioReadLite, with 51.19% test accuracy. Both AOA Reader and AS Reader outperform the baselines by a wide margin on the test subset of BioReadLite. Our re-implementations of the two MRC methods are also publicly available.",{"paper_id":8788,"title":8789,"year":81,"month":855,"day":63,"doi":8790,"resource_url":8791,"first_page":63,"last_page":63,"pdf_url":8792,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8793,"paper_type":860,"authors":8794,"abstract":8801},"lrec2018-main-440","MMQA: A Multi-domain Multi-lingual Question-Answering Framework for English and Hindi","10.63317\u002F34qh4vbtx4di","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-440","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F826.pdf","gupta-etal-2018-mmqa",[8795,8796,8799,8800],{"paper_id":8788,"author_seq":247,"given_name":5996,"surname":2603,"affiliation":63,"orcid":63},{"paper_id":8788,"author_seq":232,"given_name":8797,"surname":8798,"affiliation":63,"orcid":63},"Surabhi","Kumari",{"paper_id":8788,"author_seq":218,"given_name":1861,"surname":1862,"affiliation":63,"orcid":63},{"paper_id":8788,"author_seq":203,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},"In this paper, we assess the challenges for multi-domain, multi-lingual question answering, create necessary resources for benchmarking 
and develop a baseline model. We curate 500 articles in six different domains from the web. These articles form a comparable corpora of 250 English documents and 250 Hindi documents. From these comparable corpora, we have created 5,495 question-answer pairs with the questions and answers, both being in English and Hindi. The question can be both factoid or short descriptive types. The answers are categorized in 6 coarse and 63 finer types. To the best of our knowledge, this is the very first attempt towards creating multi-domain, multi-lingual question answering evaluation involving English and Hindi. We develop a deep learning based model for classifying an input question into the coarse and finer categories depending upon the expected answer. Answers are extracted through similarity computation and subsequent ranking. For factoid question, we obtain an MRR value of 49.10% and for short descriptive question, we obtain a BLEU score of 41.37%. Evaluation of question classification model shows the accuracies of 90.12% and 80.30% for coarse and finer classes, respectively.",{"paper_id":8803,"title":8804,"year":81,"month":855,"day":63,"doi":8805,"resource_url":8806,"first_page":63,"last_page":63,"pdf_url":8807,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8808,"paper_type":860,"authors":8809,"abstract":8815},"lrec2018-main-441","The First 100 Days: A Corpus Of Political Agendas on Twitter","10.63317\u002F4wihq2skm3q6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-441","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F570.pdf","green-larasati-2018-first",[8810,8812],{"paper_id":8803,"author_seq":247,"given_name":5330,"surname":8811,"affiliation":63,"orcid":63},"Green",{"paper_id":8803,"author_seq":232,"given_name":8813,"surname":8814,"affiliation":63,"orcid":63},"Septina","Larasati","The first 100 days corpus is a curated corpus of the first 100 days of the United States of America's President and
the Senate. During the first 100 days, the political parties in the USA try to push their agendas for the upcoming year under the new President. As communication has changed this is primarily being done on Twitter so that the President and Senators can communicate directly with their constituents. We analyzed the current President along with 100 Senators ranging the political spectrum to see the differences in their language usage. The creation of this corpus is intended to help Natural Language Processing (NLP) and Political Science research studying the changing political climate during a shift in power through language. To help accomplish this, the corpus is harvested and normalized in multiple formats. As well, we include gold standard part-of-speech tags for selected individuals including the President. Through analysis of the text, a clear distinction between political parties can be found. This analysis shows the important item of their political agendas during the first 100 days of a new party in power.",{"paper_id":8817,"title":8818,"year":81,"month":855,"day":63,"doi":8819,"resource_url":8820,"first_page":63,"last_page":63,"pdf_url":8821,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8822,"paper_type":860,"authors":8823,"abstract":8832},"lrec2018-main-442","Medical Sentiment Analysis using Social Media: Towards building a Patient Assisted 
System","10.63317\u002F2c4frtq8pjbg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-442","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F577.pdf","yadav-etal-2018-medical",[8824,8827,8828,8831],{"paper_id":8817,"author_seq":247,"given_name":8825,"surname":8826,"affiliation":63,"orcid":63},"Shweta","Yadav",{"paper_id":8817,"author_seq":232,"given_name":1861,"surname":1862,"affiliation":63,"orcid":63},{"paper_id":8817,"author_seq":218,"given_name":8829,"surname":8830,"affiliation":63,"orcid":63},"Sriparna","Saha",{"paper_id":8817,"author_seq":203,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},"With the enormous growth of Internet, more users have engaged in health communities such as medical forums to gather health-related information, to share experiences about drugs, treatments, diagnosis or to interact with other users with the similar condition in communities. Monitoring social media platforms has recently fascinated medical natural language processing researchers to detect various medical abnormalities such as adverse drug reaction. In this paper, we present a benchmark setup for analyzing the sentiment with respect to users’ medical condition considering the information, available in social media in particular. To this end, we have crawled the medical forum website ‘patient.info’ with opinions about medical condition self-narrated by the users. We constrained ourselves to some of the popular domains such as depression, anxiety, asthma, and allergy. The focus is given on the identification of multiple forms of medical sentiments which can be inferred from users’ medical condition, treatment, and medication. Thereafter, a deep Convolutional Neural Network (CNN) based medical sentiment analysis system is developed for the purpose of evaluation. 
The resources are made available to the community through LRE map for further research.",{"paper_id":8834,"title":8835,"year":81,"month":855,"day":63,"doi":8836,"resource_url":8837,"first_page":63,"last_page":63,"pdf_url":8838,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8839,"paper_type":860,"authors":8840,"abstract":8850},"lrec2018-main-443","An Italian Twitter Corpus of Hate Speech against Immigrants","10.63317\u002F26vnss59hfcq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-443","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F710.pdf","sanguinetti-etal-2018-italian",[8841,8842,8844,8845,8848],{"paper_id":8834,"author_seq":247,"given_name":2539,"surname":6009,"affiliation":63,"orcid":63},{"paper_id":8834,"author_seq":232,"given_name":6021,"surname":8843,"affiliation":63,"orcid":63},"Poletto",{"paper_id":8834,"author_seq":218,"given_name":6011,"surname":6012,"affiliation":63,"orcid":63},{"paper_id":8834,"author_seq":203,"given_name":8846,"surname":8847,"affiliation":63,"orcid":63},"Viviana","Patti",{"paper_id":8834,"author_seq":188,"given_name":922,"surname":8849,"affiliation":63,"orcid":63},"Stranisci","The paper describes a recently-created Twitter corpus of about 6,000 tweets, annotated for hate speech against immigrants, and developed to be a reference dataset for an automatic system of hate speech monitoring. The annotation scheme was therefore specifically designed to account for the multiplicity of factors that can contribute to the definition of a hate speech notion, and to offer a broader tagset capable of better representing all those factors, which may increase, or rather mitigate, the impact of the message. This resulted in a scheme that includes, besides hate speech, the following categories: aggressiveness, offensiveness, irony, stereotype, and (on an experimental basis) intensity. 
The paper hereby presented namely focuses on how this annotation scheme was designed and applied to the corpus. In particular, also comparing the annotation produced by CrowdFlower contributors and by expert annotators, we make some remarks about the value of the novel resource as gold standard, which stems from a preliminary qualitative analysis of the annotated data and on future corpus development.",{"paper_id":8852,"title":8853,"year":81,"month":855,"day":63,"doi":8854,"resource_url":8855,"first_page":63,"last_page":63,"pdf_url":8856,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8857,"paper_type":860,"authors":8858,"abstract":8866},"lrec2018-main-444","A Large Multilingual and Multi-domain Dataset for Recommender Systems","10.63317\u002F2qmmtegbr64c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-444","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F858.pdf","di-tommaso-etal-2018-large",[8859,8862,8863],{"paper_id":8852,"author_seq":247,"given_name":8860,"surname":8861,"affiliation":63,"orcid":63},"Giorgia","Di Tommaso",{"paper_id":8852,"author_seq":232,"given_name":2733,"surname":2734,"affiliation":63,"orcid":63},{"paper_id":8852,"author_seq":218,"given_name":8864,"surname":8865,"affiliation":63,"orcid":63},"Paola","Velardi","This paper presents a multi-domain interests dataset to train and test Recommender Systems, and the methodology to create the dataset from Twitter messages in English and Italian. The English dataset includes an average of 90 preferences per user on music, books, movies, celebrities, sport, politics and much more, for about half million users. Preferences are either extracted from messages of users who use Spotify, Goodreads and other similar content sharing platforms, or induced from their ”topical” friends, i.e., followees representing an interest rather than a social relation between peers. 
In addition, preferred items are matched with Wikipedia articles describing them. This unique feature of our dataset provides a mean to derive a semantic categorization of the preferred items, exploiting available semantic resources linked to Wikipedia such as the Wikipedia Category Graph, DBpedia, BabelNet and others.",{"paper_id":8868,"title":8869,"year":81,"month":855,"day":63,"doi":8870,"resource_url":8871,"first_page":63,"last_page":63,"pdf_url":8872,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8873,"paper_type":860,"authors":8874,"abstract":8886},"lrec2018-main-445","RtGender: A Corpus for Studying Differential Responses to Gender","10.63317\u002F3di79eiki43x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-445","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F903.pdf","voigt-etal-2018-rtgender",[8875,8877,8879,8882,8883],{"paper_id":8868,"author_seq":247,"given_name":3011,"surname":8876,"affiliation":63,"orcid":63},"Voigt",{"paper_id":8868,"author_seq":232,"given_name":1199,"surname":8878,"affiliation":63,"orcid":63},"Jurgens",{"paper_id":8868,"author_seq":218,"given_name":8880,"surname":8881,"affiliation":63,"orcid":63},"Vinodkumar","Prabhakaran",{"paper_id":8868,"author_seq":203,"given_name":2383,"surname":4238,"affiliation":63,"orcid":63},{"paper_id":8868,"author_seq":188,"given_name":8884,"surname":8885,"affiliation":63,"orcid":63},"Yulia","Tsvetkov","Like many social variables, gender pervasively influences how people communicate with one another.  However, prior computational work has largely focused on linguistic gender difference and communications about gender, rather than communications directed to people of that gender, in part due to lack of data.  Here, we fill a critical need by introducing a multi-genre corpus of more than 25M comments from five socially and topically diverse sources tagged for the gender of the addressee.  
Using these data, we describe pilot studies on how differential responses to gender can be measured and analyzed and present 30k annotations for the sentiment and relevance of these responses, showing that across our datasets responses to women are more likely to be emotive and about the speaker as an individual (rather than about the content being responded to).  Our dataset enables studying socially important questions like gender bias, and has potential uses for downstream applications such as dialogue systems, gender detection or obfuscation, and debiasing language generation.",{"paper_id":8888,"title":8889,"year":81,"month":855,"day":63,"doi":8890,"resource_url":8891,"first_page":63,"last_page":63,"pdf_url":8892,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8893,"paper_type":860,"authors":8894,"abstract":8898},"lrec2018-main-446","A Neural Network Model for Part-Of-Speech Tagging of Social Media Texts","10.63317\u002F5d29u4hk8efv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-446","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F913.pdf","meftah-semmar-2018-neural",[8895,8897],{"paper_id":8888,"author_seq":247,"given_name":2650,"surname":8896,"affiliation":63,"orcid":63},"Meftah",{"paper_id":8888,"author_seq":232,"given_name":1825,"surname":1826,"affiliation":63,"orcid":63},"In this paper, we propose a neural network model for Part-Of-Speech (POS) tagging of User-Generated Content (UGC) such as Twitter, Facebook and Web forums. The proposed model is end-to-end and uses both character and word level representations. Character level representations are learned during the training of the model through a Convolutional Neural Network (CNN). For word level representations, we combine several pre-trainned embeddings (Word2Vec, FastText and GloVe). To deal with the issue of the poor availability of annotated social media data, we have implemented a Transfer Learning (TL) approach. 
We demonstrate the validity and genericity of our model on a POS tagging task by conducting our experiments on five social media languages (English, German, French, Italian and Spanish).",{"paper_id":8900,"title":8901,"year":81,"month":855,"day":63,"doi":8902,"resource_url":8903,"first_page":63,"last_page":63,"pdf_url":8904,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8905,"paper_type":860,"authors":8906,"abstract":8916},"lrec2018-main-447","Utilizing Large Twitter Corpora to Create Sentiment Lexica","10.63317\u002F33kcb4ixr4hd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-447","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1028.pdf","fredriksen-etal-2018-utilizing",[8907,8910,8913],{"paper_id":8900,"author_seq":247,"given_name":8908,"surname":8909,"affiliation":63,"orcid":63},"Valerij","Fredriksen",{"paper_id":8900,"author_seq":232,"given_name":8911,"surname":8912,"affiliation":63,"orcid":63},"Brage","Jahren",{"paper_id":8900,"author_seq":218,"given_name":8914,"surname":8915,"affiliation":63,"orcid":63},"Björn","Gambäck","The paper describes an automatic Twitter sentiment lexicon creator and a lexicon-based sentiment analysis system. The lexicon creator is based on a Pointwise Mutual Information approach, utilizing 6.25 million automatically labeled tweets and 103 million unlabeled, with the created lexicon consisting of about 3 000 entries. In a comparison experiment, this lexicon beat a manually annotated lexicon. 
A sentiment analysis system utilizing the created lexicon, and handling both negation and intensification, produces results almost on par with sophisticated machine learning-based systems, while significantly outperforming those in terms of run-time.",{"paper_id":8918,"title":8919,"year":81,"month":855,"day":63,"doi":8920,"resource_url":8921,"first_page":63,"last_page":63,"pdf_url":8922,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8923,"paper_type":860,"authors":8924,"abstract":8929},"lrec2018-main-448","The Nautilus Speaker Characterization Corpus: Speech Recordings and Labels of Speaker Characteristics and Voice Descriptions","10.63317\u002F2xig3hmcwfki","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-448","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F24.pdf","fernandez-gallardo-weiss-2018-nautilus",[8925,8927],{"paper_id":8918,"author_seq":247,"given_name":1172,"surname":8926,"affiliation":63,"orcid":63},"Fernández Gallardo",{"paper_id":8918,"author_seq":232,"given_name":2202,"surname":8928,"affiliation":63,"orcid":63},"Weiss","The Nautilus Speaker Characterization corpus is presented. It comprises conversational microphone speech recordings from 300 German speakers (126 males and 174 females) made in 2016\u002F2017 in the acoustically-isolated room Nautilus of the Quality and Usability Lab of the Technische Universität Berlin, Germany. Four scripted and four semi-spontaneous dialogs were elicited from the speakers, simulating telephone call inquiries. Additionally, other spontaneous neutral and emotional speech utterances and questions were produced. Interactions between speakers and their interlocutor (who also conducted the recording session) are provided in separate mono files, accompanied by timestamps and tags that define the speaker's turns. 
One of the recorded semi-spontaneous dialogs has been labeled by external assessors on 34 interpersonal speaker characteristics for each speaker, employing continuous sliders. Additionally, 20 selected speakers have been labeled on 34 naive voice descriptions. The corpus labels permit to investigate the speech features that contribute to human perceptions and automatic recognition of speaker social characteristics and interpersonal traits.",{"paper_id":8931,"title":8932,"year":81,"month":855,"day":63,"doi":8933,"resource_url":8934,"first_page":63,"last_page":63,"pdf_url":8935,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8936,"paper_type":860,"authors":8937,"abstract":8941},"lrec2018-main-449","Evaluation of Automatic Formant Trackers","10.63317\u002F2jdmn2nnuwo2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-449","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F28.pdf","schiel-zitzelsberger-2018-evaluation",[8938,8939],{"paper_id":8931,"author_seq":247,"given_name":6048,"surname":6049,"affiliation":63,"orcid":63},{"paper_id":8931,"author_seq":232,"given_name":1615,"surname":8940,"affiliation":63,"orcid":63},"Zitzelsberger","Four open source formant trackers, three LPC-based and one based on Deep Learning, were evaluated on the same American English data set VTR-TIMIT. Test data were time-synchronized to avoid differences due to different unvoiced\u002Fvoiced detection strategies. Default output values of trackers (e.g. producing 500Hz for the first formant, 1500Hz for the second etc.) were filtered from the evaluation data to avoid biased results. Evaluations were performed on the total recording and on three American English vowels [i:], [u] and [ʌ] separately. The obtained quality measures showed that all three LPC-based trackers had comparable RMSE error results that are about 2 times the inter-labeller error of human labellers. 
Tracker results were biased considerably (in average too high or low), when the parameter settings of the tracker were not adjusted to the speaker's sex. Deep Learning appeared to outperform LPC-based trackers in general, but not in vowels. Deep Learning has the disadvantage that it requires annotated training material from the same speech domain as the target speech, and a trained Deep Learning tracker is therefore not applicable to other languages.",{"paper_id":8943,"title":8944,"year":81,"month":855,"day":63,"doi":8945,"resource_url":8946,"first_page":63,"last_page":63,"pdf_url":8947,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8948,"paper_type":860,"authors":8949,"abstract":8962},"lrec2018-main-450","Design and Development of Speech Corpora for Air Traffic Control Training","10.63317\u002F43donw5cdqki","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-450","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F41.pdf","smidl-etal-2018-design",[8950,8953,8954,8956,8959,8961],{"paper_id":8943,"author_seq":247,"given_name":8951,"surname":8952,"affiliation":63,"orcid":63},"Luboš","Šmídl",{"paper_id":8943,"author_seq":232,"given_name":2633,"surname":6931,"affiliation":63,"orcid":63},{"paper_id":8943,"author_seq":218,"given_name":2554,"surname":8955,"affiliation":63,"orcid":63},"Tihelka",{"paper_id":8943,"author_seq":203,"given_name":8957,"surname":8958,"affiliation":63,"orcid":63},"Jindřich","Matoušek",{"paper_id":8943,"author_seq":188,"given_name":2633,"surname":8960,"affiliation":63,"orcid":63},"Romportl",{"paper_id":8943,"author_seq":172,"given_name":4670,"surname":6918,"affiliation":63,"orcid":63},"The paper describes the process of creation of domain-specific speech corpora containing air traffic control (ATC) communication prompts. 
Since the ATC domain is highly specific both from the acoustic point-of-view (significant level of noise in the signal, non-native English accents of the speakers, non-standard pronunciation of some frequent words) and the lexical and syntactic perspective (prescribed structure of utterances, rather limited vocabulary), it is useful to collect and annotate data from this specific domain. Actually, the ultimate goal of the research effort of our team was to develop a voice dialogue system simulating the responses of the pilot that could be used for training aspiring air traffic controllers. In order to do so, we needed - among other modules - a domain-specific automatic speech recognition (ASR) and text-to-speech synthesis (TTS) engines. This paper concentrates on the details of the ASR and TTS corpora creation process but also overviews their usage in preparing practical applications and provides links to the distribution channel of the data.",{"paper_id":8964,"title":8965,"year":81,"month":855,"day":63,"doi":8966,"resource_url":8967,"first_page":63,"last_page":63,"pdf_url":8968,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8969,"paper_type":860,"authors":8970,"abstract":8976},"lrec2018-main-451","A First South African Corpus of Multilingual Code-switched Soap Opera Speech","10.63317\u002F5j7f6ra8mxhn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-451","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F90.pdf","van-der-westhuizen-niesler-2018-first",[8971,8974],{"paper_id":8964,"author_seq":247,"given_name":8972,"surname":8973,"affiliation":63,"orcid":63},"Ewald","van der Westhuizen",{"paper_id":8964,"author_seq":232,"given_name":1615,"surname":8975,"affiliation":63,"orcid":63},"Niesler","We introduce a speech corpus containing multilingual code-switching compiled from South African soap operas. 
The corpus contains English, isiZulu, isiXhosa, Setswana and Sesotho speech, paired into four language-balanced subcorpora containing English-isiZulu, English-isiXhosa, English-Setswana and English-Sesotho. In total, the corpus contains 14.3 hours of annotated and segmented speech. The soap opera speech is typically fast, spontaneous and may express emotion, with a speech rate that is between 1.22 and 1.83 times higher than prompted speech in the same languages. Among the 10343 code-switched utterances in the corpus, 19207 intrasentential language switches are observed. Insertional code-switching with English words is observed to be most frequent. Intraword code-switching, where English words are supplemented with Bantu affixes in an effort to conform to Bantu phonology, is also observed. Most bigrams containing code-switching occur only once, making up between 64% and 92% of such bigrams in each subcorpus.",{"paper_id":8978,"title":8979,"year":81,"month":855,"day":63,"doi":8980,"resource_url":8981,"first_page":63,"last_page":63,"pdf_url":8982,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8983,"paper_type":860,"authors":8984,"abstract":8988},"lrec2018-main-452","A Web Service for Pre-segmenting Very Long Transcribed Speech Recordings","10.63317\u002F3y87a3dynnw6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-452","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F105.pdf","poerner-schiel-2018-web",[8985,8987],{"paper_id":8978,"author_seq":247,"given_name":8535,"surname":8986,"affiliation":63,"orcid":63},"Poerner",{"paper_id":8978,"author_seq":232,"given_name":6048,"surname":6049,"affiliation":63,"orcid":63},"The run time of classical text-to-speech alignment algorithms tends to grow quadratically with the length of the input. This makes it difficult to apply them to very long speech recordings. 
In this paper, we describe and evaluate two algorithms that pre-segment long recordings into manageable \"chunks\". The first algorithm is fast but cannot guarantee short chunks on noisy recordings or erroneous transcriptions. The second algorithm reliably delivers short chunks but is less effective in terms of run time and chunk boundary accuracy. We show that both algorithms reduce the run time of the MAUS speech segmentation system to under real-time, even on recordings that could not previously be processed. Evaluation on real-world recordings in three different languages shows that the majority of chunk boundaries obtained with the proposed methods deviate less than 100 ms from a ground truth segmentation. On a separate German studio quality recording, MAUS word segmentation accuracy was slightly improved by both algorithms. The chunking service is freely accessible via a web API in the CLARIN infrastructure, and currently supports 33 languages and dialects.",{"paper_id":8990,"title":8991,"year":81,"month":855,"day":63,"doi":8992,"resource_url":8993,"first_page":63,"last_page":63,"pdf_url":8994,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":8995,"paper_type":860,"authors":8996,"abstract":9017},"lrec2018-main-453","A Real-life, French-accented Corpus of Air Traffic Control 
Communications","10.63317\u002F5medhwrjr6ju","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-453","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F108.pdf","delpech-etal-2018-real",[8997,9000,9003,9006,9009,9011,9014],{"paper_id":8990,"author_seq":247,"given_name":8998,"surname":8999,"affiliation":63,"orcid":63},"Estelle","Delpech",{"paper_id":8990,"author_seq":232,"given_name":9001,"surname":9002,"affiliation":63,"orcid":63},"Marion","Laignelet",{"paper_id":8990,"author_seq":218,"given_name":9004,"surname":9005,"affiliation":63,"orcid":63},"Christophe","Pimm",{"paper_id":8990,"author_seq":203,"given_name":9007,"surname":9008,"affiliation":63,"orcid":63},"Céline","Raynal",{"paper_id":8990,"author_seq":188,"given_name":1605,"surname":9010,"affiliation":63,"orcid":63},"Trzos",{"paper_id":8990,"author_seq":172,"given_name":9012,"surname":9013,"affiliation":63,"orcid":63},"Alexandre","Arnold",{"paper_id":8990,"author_seq":155,"given_name":9015,"surname":9016,"affiliation":63,"orcid":63},"Dominique","Pronto","This paper describes the creation  of the AIRBUS-ATC corpus, which is a real-life, French-accented speech corpus of Air Traffic Control (ATC) communications (message exchanged between pilots and controllers) intended to build a robust ATC speech recognition engine. The corpus is currently composed of 59 hours of transcribed English audio, along with linguistic and meta-data annotations. It is intended to reach 100 hours by the end of the project. We describe ATC speech specificities, how the audio is collected, transcribed and what techniques were used to ensure transcription quality while limiting transcription costs. A detailed description of the corpus content (speaker gender, accent, role, type of control, speech turn duration) is given. 
Finally, preliminary results obtained with state-of-the-art speech recognition techniques support the idea that accent-specific corpora will play a pivotal role in building robust ATC speech recognition applications.",{"paper_id":9019,"title":9020,"year":81,"month":855,"day":63,"doi":9021,"resource_url":9022,"first_page":63,"last_page":63,"pdf_url":9023,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9024,"paper_type":860,"authors":9025,"abstract":9029},"lrec2018-main-454","Creating Lithuanian and Latvian Speech Corpora from Inaccurately Annotated Web Data","10.63317\u002F3xqm99aw79qe","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-454","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F258.pdf","salimbajevs-2018-creating",[9026],{"paper_id":9019,"author_seq":247,"given_name":9027,"surname":9028,"affiliation":63,"orcid":63},"Askars","Salimbajevs","This paper describes the method that was used to produce additional acoustic model training data for the less-resourced languages of Lithuanian and Latvian. The method uses existing baseline speech recognition systems for Latvian and Lithuanian to align audio data from the Web with imprecise non-normalised transcripts.  From 690 hours of Web data (300h for Latvian, 390h for Lithuanian), we have created additional 378 hours of training data (186h for Latvian and 192 for Lithuanian). Combining this additional data with baseline training data allowed to significantly improve word error rate for Lithuanian from 40% to 23%. 
Word error rate for the Latvian system was improved from 19% to 17%.",{"paper_id":9031,"title":9032,"year":81,"month":855,"day":63,"doi":9033,"resource_url":9034,"first_page":63,"last_page":63,"pdf_url":9035,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9036,"paper_type":860,"authors":9037,"abstract":9050},"lrec2018-main-455","Discovering Canonical Indian English Accents: A Crowdsourcing-based Approach","10.63317\u002F2r92dmgdmnua","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-455","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F279.pdf","sitaram-etal-2018-discovering",[9038,9041,9044,9046,9047,9048],{"paper_id":9031,"author_seq":247,"given_name":9039,"surname":9040,"affiliation":63,"orcid":63},"Sunayana","Sitaram",{"paper_id":9031,"author_seq":232,"given_name":9042,"surname":9043,"affiliation":63,"orcid":63},"Varun","Manjunath",{"paper_id":9031,"author_seq":218,"given_name":9042,"surname":9045,"affiliation":63,"orcid":63},"Bharadwaj",{"paper_id":9031,"author_seq":203,"given_name":5599,"surname":5600,"affiliation":63,"orcid":63},{"paper_id":9031,"author_seq":188,"given_name":5602,"surname":5603,"affiliation":63,"orcid":63},{"paper_id":9031,"author_seq":172,"given_name":1780,"surname":9049,"affiliation":63,"orcid":63},"Tjalve","Automatic Speech Recognition (ASR) systems typically degrade in performance when recognizing an accent different from the accents in the training data. One way to overcome this problem without training new models for every accent is adaptation. India has over a hundred major languages, which leads to many variants in Indian English accents. Making an ASR system work well for Indian English would involve collecting data for all representative accents in Indian English and then adapting Acoustic Models for each of those accents. 
However, given the number of languages that exist in India and the lack of a prior work in literature about how many Indian English accents exist, it is difficult to come up with a set of canonical accents that could sufficiently capture the variations observed in Indian English. In addition, there is a lack of labeled corpora of accents in Indian English. We approach the problem of determining a set of canonical Indian English accents by taking a crowdsourcing based approach. We conduct a mobile app based user study in which we play audio samples collected from all over India and ask users to identify the geographical origin of the speaker. We measure the consensus among users to come up with a set of candidate accents in Indian English and identify which accents are best recognized and which ones are confusable. We extend our preliminary user study to a web app-based study that can potentially generate more labeled data for Indian English accents. We describe results and challenges encountered in a pilot study conducted using the web-app and future work to scale up the study.",{"paper_id":9052,"title":9053,"year":81,"month":855,"day":63,"doi":9054,"resource_url":9055,"first_page":63,"last_page":63,"pdf_url":9056,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9057,"paper_type":860,"authors":9058,"abstract":9076},"lrec2018-main-456","Extending Search System based on Interactive Visualization for Speech 
Corpora","10.63317\u002F5bkbydyebsiy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-456","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F311.pdf","ohsuga-etal-2018-extending",[9059,9062,9065,9067,9070,9073],{"paper_id":9052,"author_seq":247,"given_name":9060,"surname":9061,"affiliation":63,"orcid":63},"Tomoko","Ohsuga",{"paper_id":9052,"author_seq":232,"given_name":9063,"surname":9064,"affiliation":63,"orcid":63},"Yuichi","Ishimoto",{"paper_id":9052,"author_seq":218,"given_name":9060,"surname":9066,"affiliation":63,"orcid":63},"Kajiyama",{"paper_id":9052,"author_seq":203,"given_name":9068,"surname":9069,"affiliation":63,"orcid":63},"Shunsuke","Kozawa",{"paper_id":9052,"author_seq":188,"given_name":9071,"surname":9072,"affiliation":63,"orcid":63},"Kiyotaka","Uchimoto",{"paper_id":9052,"author_seq":172,"given_name":9074,"surname":9075,"affiliation":63,"orcid":63},"Shuichi","Itahashi","This paper describes a search system that we have developed specifically for speech corpus retrieval. It is difficult for speech corpus users to compare and select suitable corpora from the large number of various language resources in the world. It would be more convenient for users if each data center used a common specification system for describing its corpora. With the “Concentric Ring View (CRV) System” we proposed, users can search for speech corpora interactively and visually by utilizing the attributes peculiar to speech corpora. 
We have already proposed a set of specification attributes and items as the first step towards standardization, and we have added these attributes and items to the large-scale metadata database “SHACHI”, then we connected SHACHI to the CRV system and implemented it as a combined speech corpus search system.",{"paper_id":9078,"title":9079,"year":81,"month":855,"day":63,"doi":9080,"resource_url":9081,"first_page":63,"last_page":63,"pdf_url":9082,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9083,"paper_type":860,"authors":9084,"abstract":9098},"lrec2018-main-457","German Radio Interviews: The GRAIN Release of the SFB732 Silver Standard Collection","10.63317\u002F5iio37a8q3nx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-457","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F320.pdf","schweitzer-etal-2018-german",[9085,9088,9089,9090,9091,9092,9093,9095,9097],{"paper_id":9078,"author_seq":247,"given_name":9086,"surname":9087,"affiliation":63,"orcid":63},"Katrin","Schweitzer",{"paper_id":9078,"author_seq":232,"given_name":7189,"surname":7190,"affiliation":63,"orcid":63},{"paper_id":9078,"author_seq":218,"given_name":1771,"surname":2668,"affiliation":63,"orcid":63},{"paper_id":9078,"author_seq":203,"given_name":7186,"surname":7187,"affiliation":63,"orcid":63},{"paper_id":9078,"author_seq":188,"given_name":6478,"surname":6479,"affiliation":63,"orcid":63},{"paper_id":9078,"author_seq":172,"given_name":2002,"surname":2003,"affiliation":63,"orcid":63},{"paper_id":9078,"author_seq":155,"given_name":9094,"surname":9087,"affiliation":63,"orcid":63},"Antje",{"paper_id":9078,"author_seq":138,"given_name":6728,"surname":9096,"affiliation":63,"orcid":63},"Stehwien",{"paper_id":9078,"author_seq":121,"given_name":1278,"surname":4138,"affiliation":63,"orcid":63},"We present GRAIN (German RAdio INterviews) as part of the SFB732 Silver Standard Collection. 
GRAIN contains German radio interviews and is annotated on multiple linguistic layers. The data has been processed with state-of-the-art tools for text and speech and therefore represents a resource for text-based linguistic research as well as speech science. While there is a gold standard part with manual annotations, the (much larger) silver standard part (which is growing as the radio station releases more interviews) relies completely on automatic annotations. We explicitly release different versions of annotations for the same layers (e.g. morpho-syntax) with the aim to combine and compare multiple layers in order to derive confidence estimations for the annotations. Therefore, parts of the data where the output of several tools match can be considered clear-cut cases, while mismatches hint at areas of interest which are potentially challenging or where rare phenomena can be found.",{"paper_id":9100,"title":9101,"year":81,"month":855,"day":63,"doi":9102,"resource_url":9103,"first_page":63,"last_page":63,"pdf_url":9104,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9105,"paper_type":860,"authors":9106,"abstract":9112},"lrec2018-main-458","Preparing Data from Psychotherapy for Natural Language Processing","10.63317\u002F2dkm97ff6ken","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-458","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F418.pdf","mieskes-stiegelmayr-2018-preparing",[9107,9110],{"paper_id":9100,"author_seq":247,"given_name":9108,"surname":9109,"affiliation":63,"orcid":63},"Margot","Mieskes",{"paper_id":9100,"author_seq":232,"given_name":3238,"surname":9111,"affiliation":63,"orcid":63},"Stiegelmayr","Mental health and well-being are growing issues in western civilizations. But at the same time, psychotherapy and further education in psychotherapy is a highly demanding occupation, resulting in a severe gap in patient-centered care. 
The question which arises from recent developments in natural language processing (NLP) and speech recognition is, how these technologies could be employed to support the therapists in their work and allow for a better treatment of patients. Most research in NLP focuses on analysing the language of patients with various psychological conditions, but only few examples exist that analyse the therapists behavior and the interaction between therapist and patient. We present ongoing work in collecting, preparing and analysing data from psychotherapy sessions together with expert annotations on various qualitative dimensions of these sessions, such as feedback and cooperation. Our aim is to use this data in a classification task, which gives insight into what qualifies for good feedback or cooperation in therapy sessions and employ this information to support psychotherapists in improving the quality of the care they offer.",{"paper_id":9114,"title":9115,"year":81,"month":855,"day":63,"doi":9116,"resource_url":9117,"first_page":63,"last_page":63,"pdf_url":9118,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9119,"paper_type":860,"authors":9120,"abstract":9128},"lrec2018-main-459","MirasVoice: A bilingual (English-Persian) speech 
corpus","10.63317\u002F53h3mag8tg9y","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-459","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F443.pdf","vaheb-etal-2018-mirasvoice",[9121,9122,9123,9124,9127],{"paper_id":9114,"author_seq":247,"given_name":1797,"surname":4345,"affiliation":63,"orcid":63},{"paper_id":9114,"author_seq":232,"given_name":2863,"surname":4340,"affiliation":63,"orcid":63},{"paper_id":9114,"author_seq":218,"given_name":4342,"surname":4343,"affiliation":63,"orcid":63},{"paper_id":9114,"author_seq":203,"given_name":9125,"surname":9126,"affiliation":63,"orcid":63},"Saeid","Safavi",{"paper_id":9114,"author_seq":188,"given_name":4189,"surname":4335,"affiliation":63,"orcid":63},"Speech and speaker recognition is one of the most important research and development areas and has received quite a lot of attention in recent years. The desire to produce a natural form of communication between humans and machines can be considered the motivating factor behind such developments. Speech has the potential to influence numerous fields of research and development. In this paper, MirasVoice which is a bilingual (English-Farsi) speech corpus is presented. Over 50 native Iranian speakers who were able to speak in both the Farsi and English languages have volunteered to help create this bilingual corpus. The volunteers read text documents and then had to answer questions spontaneously in both English and Farsi. The text-independent GMM-UBM speaker verification engine was designed in this study for validating and exploring the performance of this corpus. This multilingual speech corpus could be used in a variety of language dependent and independent applications. For example, it can be used to investigate the effects of different languages (Farsi and English) on the performance of speaker verification systems. 
The authors of this paper have also investigated speaker verification systems performances when using different train\u002Ftest architectures.",{"paper_id":9130,"title":9131,"year":81,"month":855,"day":63,"doi":9132,"resource_url":9133,"first_page":63,"last_page":63,"pdf_url":9134,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9135,"paper_type":860,"authors":9136,"abstract":9142},"lrec2018-main-460","Dialog Intent Structure: A Hierarchical Schema of Linked Dialog Acts","10.63317\u002F24qyij4nv6um","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-460","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F334.pdf","pareti-lando-2018-dialog",[9137,9140],{"paper_id":9130,"author_seq":247,"given_name":9138,"surname":9139,"affiliation":63,"orcid":63},"Silvia","Pareti",{"paper_id":9130,"author_seq":232,"given_name":7109,"surname":9141,"affiliation":63,"orcid":63},"Lando","In this paper, we present a new hierarchical and extensible schema for dialog representation. The schema captures the pragmatic intents of the conversation independently from any semantic representation. This schema was developed to support computational applications, be applicable to different types of dialogs and domains and enable large-scale non-expert annotation. The schema models dialog as a structure of linked units of intent, dialog acts, that are annotated on minimal spans of text, functional segments. Furthermore, we categorise dialog acts based on whether they express a primary or secondary intent and whether the intent is explicit or implicit. 
We successfully tested the schema on a heterogeneous corpus of human-human dialogs comprising both spoken and chat interactions.",{"paper_id":9144,"title":9145,"year":81,"month":855,"day":63,"doi":9146,"resource_url":9147,"first_page":63,"last_page":63,"pdf_url":9148,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9149,"paper_type":860,"authors":9150,"abstract":9153},"lrec2018-main-461","JDCFC: A Japanese Dialogue Corpus with Feature Changes","10.63317\u002F5627yrxpwh73","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-461","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F346.pdf","nakamura-kawahara-2018-jdcfc",[9151,9152],{"paper_id":9144,"author_seq":247,"given_name":4999,"surname":4455,"affiliation":63,"orcid":63},{"paper_id":9144,"author_seq":232,"given_name":1879,"surname":1880,"affiliation":63,"orcid":63},"In recent years, the importance of dialogue understanding systems has been increasing. However, it is difficult for computers to deeply understand our daily conversations because we frequently use emotional expressions in conversations. This is partially because there are no large-scale corpora focusing on the detailed relationships between emotions and utterances. In this paper, we propose a dialogue corpus constructed based on our knowledge base, called the Japanese Feature Change Knowledge Base (JFCKB). In JFCKB and the proposed corpus, the feature changes (mainly emotions) of arguments in event sentences (or utterances) and those of the event sentence recognizers (or utterance recognizers) are associated with the event sentences (or utterances). The feature change information of arguments in utterances and those of the utterance recognizers, replies to the utterances, and the reasonableness of the replies were gathered through crowdsourcing tasks. We conducted an experiment to investigate whether a machine learning method can recognize the reasonableness of a given conversation. 
Experimental result suggested the usefulness of our proposed corpus.",{"paper_id":9155,"title":9156,"year":81,"month":855,"day":63,"doi":9157,"resource_url":9158,"first_page":63,"last_page":63,"pdf_url":9159,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9160,"paper_type":860,"authors":9161,"abstract":9170},"lrec2018-main-462","Japanese Dialogue Corpus of Information Navigation and Attentive Listening Annotated with Extended ISO-24617-2 Dialogue Act Tags","10.63317\u002F5odim9cyitib","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-462","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F464.pdf","yoshino-etal-2018-japanese",[9162,9163,9165,9167,9169],{"paper_id":9155,"author_seq":247,"given_name":4441,"surname":4442,"affiliation":63,"orcid":63},{"paper_id":9155,"author_seq":232,"given_name":9164,"surname":6146,"affiliation":63,"orcid":63},"Hiroki",{"paper_id":9155,"author_seq":218,"given_name":9166,"surname":5475,"affiliation":63,"orcid":63},"Kyoshiro",{"paper_id":9155,"author_seq":203,"given_name":4105,"surname":9168,"affiliation":63,"orcid":63},"Kondo",{"paper_id":9155,"author_seq":188,"given_name":4454,"surname":4455,"affiliation":63,"orcid":63},"Large-scale dialogue data annotated with dialogue states is necessary to model a natural conversation with machines. However, large-scale conventional dialogue corpora are mainly built for specified tasks (e.g., task-oriented systems for restaurant or bus information navigation) with specially designed dialogue states. Text-chat based dialogue corpora have also been built due to the growth of social communication through the internet; however, most of them do not reflect dialogue behaviors in face-to-face conversation, including backchannelings or interruptions. 
In this paper, we try to build a corpus that covers a wider range of dialogue tasks than existing task-oriented systems or text-chat systems, by transcribing face-to-face dialogues held in natural conversational situations in tasks of information navigation and attentive listening. The corpus is recorded in Japanese and annotated with an extended ISO-24617-2 dialogue act tag-set, which is defined to see behaviors in natural conversation. The developed data can be used to build a dialogue model based on the ISO-24617-2 dialogue act tags.\\\\",{"paper_id":9172,"title":9173,"year":81,"month":855,"day":63,"doi":9174,"resource_url":9175,"first_page":63,"last_page":63,"pdf_url":9176,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9177,"paper_type":860,"authors":9178,"abstract":9195},"lrec2018-main-463","The Niki and Julie Corpus: Collaborative Multimodal Dialogues between Humans, Robots, and Virtual Agents","10.63317\u002F5m3sw35yfkrx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-463","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F482.pdf","artstein-etal-2018-niki",[9179,9180,9183,9186,9188,9189,9192,9194],{"paper_id":9172,"author_seq":247,"given_name":1208,"surname":1209,"affiliation":63,"orcid":63},{"paper_id":9172,"author_seq":232,"given_name":9181,"surname":9182,"affiliation":63,"orcid":63},"Jill","Boberg",{"paper_id":9172,"author_seq":218,"given_name":9184,"surname":9185,"affiliation":63,"orcid":63},"Alesia","Gainer",{"paper_id":9172,"author_seq":203,"given_name":1370,"surname":9187,"affiliation":63,"orcid":63},"Gratch",{"paper_id":9172,"author_seq":188,"given_name":2211,"surname":8267,"affiliation":63,"orcid":63},{"paper_id":9172,"author_seq":172,"given_name":9190,"surname":9191,"affiliation":63,"orcid":63},"Anton","Leuski",{"paper_id":9172,"author_seq":155,"given_name":9193,"surname":7875,"affiliation":63,"orcid":63},"Gale",{"paper_id":9172,"author_seq":138,"given_name":1199,"surname":1200,"aff
iliation":63,"orcid":63},"The Niki and Julie corpus contains more than 600 dialogues between human participants and a human-controlled robot or virtual agent, engaged in a series of collaborative item-ranking tasks designed to measure influence. Some of the dialogues contain deliberate conversational errors by the robot, designed to simulate the kinds of conversational breakdown that are typical of present-day automated agents. Data collected include audio and video recordings, the results of the ranking tasks, and questionnaire responses; some of the recordings have been transcribed and annotated for verbal and nonverbal feedback. The corpus has been used to study influence and grounding in dialogue. All the dialogues are in American English.",{"paper_id":9197,"title":9198,"year":81,"month":855,"day":63,"doi":9199,"resource_url":9200,"first_page":63,"last_page":63,"pdf_url":9201,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9202,"paper_type":860,"authors":9203,"abstract":9209},"lrec2018-main-464","Constructing a Chinese Medical Conversation Corpus Annotated with Conversational Structures and Actions","10.63317\u002F3icrccmoufs2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-464","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F493.pdf","wang-etal-2018-constructing",[9204,9206,9208],{"paper_id":9197,"author_seq":247,"given_name":9205,"surname":1588,"affiliation":63,"orcid":63},"Nan",{"paper_id":9197,"author_seq":232,"given_name":9207,"surname":2599,"affiliation":63,"orcid":63},"Yan",{"paper_id":9197,"author_seq":218,"given_name":3121,"surname":1386,"affiliation":63,"orcid":63},"Overuse of antibiotics and the attributed bacterial resistance is one of the most serious global public health crises today. Previous research reported that patients' advocacy for antibiotic treatment was consequential on antibiotic over-prescribing. 
To investigate how the advocacy and other factors contribute to antibiotic over-prescribing, qualitative and quantitative analysis of doctor-patient conversation can yield valuable findings. In this paper, we introduce AMed (Annotated Corpus of Medical Conversations), a manually transcribed corpus of medical dialogue in Chinese pediatric consultations, with annotation of conversational structures and actions. Based on the annotation, a significant association between patient request for antibiotic and antibiotic over-prescribing is discovered. As this corpus is the first with annotation of conversational structures and actions on medical consultation conversations in Chinese, it can be a valuable resource for discourse and dialogue research in general, and for the understanding of human collaboration and negotiation behavior in clinical consultations in particular. Furthermore, findings from analyses of the corpus can shed light on ways to improve physician-patient communication in order to reduce antibiotic over-prescribing.",{"paper_id":9211,"title":9212,"year":81,"month":855,"day":63,"doi":9213,"resource_url":9214,"first_page":63,"last_page":63,"pdf_url":9215,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9216,"paper_type":860,"authors":9217,"abstract":9227},"lrec2018-main-465","Predicting Nods by using Dialogue Acts in Dialogue","10.63317\u002F2cqbjqga89wb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-465","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F501.pdf","ishii-etal-2018-predicting",[9218,9221,9224],{"paper_id":9211,"author_seq":247,"given_name":9219,"surname":9220,"affiliation":63,"orcid":63},"Ryo","Ishii",{"paper_id":9211,"author_seq":232,"given_name":9222,"surname":9223,"affiliation":63,"orcid":63},"Ryuichiro","Higashinaka",{"paper_id":9211,"author_seq":218,"given_name":9225,"surname":9226,"affiliation":63,"orcid":63},"Junji","Tomita","In addition to verbal behavior, nonverbal 
behavior is an important aspect for an embodied dialogue system to be able to conduct a smooth conversation with the user. Researchers have focused on automatically generating nonverbal behavior from speech and language information of dialogue systems. We propose a model to generate head nods accompanying utterance from natural language. To the best of our knowledge, previous studies generated nods from the final morphemes at the end of an utterance. In this study, we focused on dialog act information indicating the intention of an utterance and determined whether this information is effective for generating nods. First, we compiled a Japanese corpus of 24 dialogues including utterance and nod information. Next, using the corpus, we created a model that estimates whether a nod occurs during an utterance by using a morpheme at the end of a speech and dialog act. The results show that our estimation model incorporating dialog acts outperformed a model using morpheme information. The results suggest that dialog acts have the potential to be a strong predictor with which to generate nods automatically.",{"paper_id":9229,"title":9230,"year":81,"month":855,"day":63,"doi":9231,"resource_url":9232,"first_page":63,"last_page":63,"pdf_url":9233,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9234,"paper_type":860,"authors":9235,"abstract":9239},"lrec2018-main-466","Modeling Collaborative Multimodal Behavior in Group Dialogues: The MULTISIMO Corpus","10.63317\u002F5ffknhrev5q8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-466","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F596.pdf","koutsombogera-vogel-2018-modeling",[9236,9238],{"paper_id":9229,"author_seq":247,"given_name":2301,"surname":9237,"affiliation":63,"orcid":63},"Koutsombogera",{"paper_id":9229,"author_seq":232,"given_name":4210,"surname":4211,"affiliation":63,"orcid":63},"We present a multimodal corpus that has been recently developed within 
the MULTISIMO project and targets the investigation and modeling of collaborative aspects of multimodal behavior in groups that perform simple tasks. The corpus consists of a set of human-human interactions recorded in multiple modalities. In each interactive session two participants collaborate with each other to solve a quiz while assisted by a facilitator. The corpus has been transcribed and annotated with information related to verbal and non-verbal signals. A set of additional annotation and processing tasks are currently in progress. The corpus includes survey materials, i.e. personality tests and experience assessment questionnaires filled in by all participants. This dataset addresses multiparty collaborative interactions and aims at providing tools for measuring collaboration and task success based on the integration of the related multimodal information and the personality traits of the participants, but also at modeling the multimodal strategies that members of a group employ to discuss and collaborate with each other. 
The corpus is designed for public release.",{"paper_id":9241,"title":9242,"year":81,"month":855,"day":63,"doi":9243,"resource_url":9244,"first_page":63,"last_page":63,"pdf_url":9245,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9246,"paper_type":860,"authors":9247,"abstract":9266},"lrec2018-main-467","A Semi-autonomous System for Creating a Human-Machine Interaction Corpus in Virtual Reality: Application to the ACORFORMed System for Training Doctors to Break Bad News","10.63317\u002F32uq6hnoihy7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-467","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F660.pdf","ochs-etal-2018-semi",[9248,9251,9253,9256,9259,9262,9264],{"paper_id":9241,"author_seq":247,"given_name":9249,"surname":9250,"affiliation":63,"orcid":63},"Magalie","Ochs",{"paper_id":9241,"author_seq":232,"given_name":2089,"surname":9252,"affiliation":63,"orcid":63},"Blache",{"paper_id":9241,"author_seq":218,"given_name":9254,"surname":9255,"affiliation":63,"orcid":63},"Grégoire","de Montcheuil",{"paper_id":9241,"author_seq":203,"given_name":9257,"surname":9258,"affiliation":63,"orcid":63},"Jean-Marie","Pergandi",{"paper_id":9241,"author_seq":188,"given_name":9260,"surname":9261,"affiliation":63,"orcid":63},"Jorane","Saubesty",{"paper_id":9241,"author_seq":172,"given_name":2554,"surname":9263,"affiliation":63,"orcid":63},"Francon",{"paper_id":9241,"author_seq":155,"given_name":2554,"surname":9265,"affiliation":63,"orcid":63},"Mestre","In this paper, we introduce a two-step corpora-based methodology, starting from a corpus of human-human interactions to construct a semi-autonomous system in order to collect a new corpus of human-machine interaction, a step before the development of a fully autonomous system constructed based on the analysis of the collected corpora. 
The presented methodology is illustrated in the context of a virtual reality training platform for doctors breaking bad news.",{"paper_id":9268,"title":9269,"year":81,"month":855,"day":63,"doi":9270,"resource_url":9271,"first_page":63,"last_page":63,"pdf_url":9272,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9273,"paper_type":860,"authors":9274,"abstract":9285},"lrec2018-main-468","Construction of English-French Multimodal Affective Conversational Corpus from TV Dramas","10.63317\u002F4swtertzchk4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-468","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F751.pdf","novitasari-etal-2018-construction",[9275,9278,9280,9281,9284],{"paper_id":9268,"author_seq":247,"given_name":9276,"surname":9277,"affiliation":63,"orcid":63},"Sashi","Novitasari",{"paper_id":9268,"author_seq":232,"given_name":9279,"surname":2578,"affiliation":63,"orcid":63},"Quoc Truong",{"paper_id":9268,"author_seq":218,"given_name":4451,"surname":4452,"affiliation":63,"orcid":63},{"paper_id":9268,"author_seq":203,"given_name":9282,"surname":9283,"affiliation":63,"orcid":63},"Dessi","Lestari",{"paper_id":9268,"author_seq":188,"given_name":4454,"surname":4455,"affiliation":63,"orcid":63},"Recently, there has been an increase of interest in constructing corpora containing social-affective interactions. But the availability of multimodal, multilingual, and emotionally rich corpora remains limited. The tasks of recording and transcribing actual human-to-human affective conversations are also tedious and time-consuming. This paper describes construction of a multimodal affective conversational corpus based on TV dramas. The data contain parallel English-French languages in lexical, acoustic, and facial features. In addition, we annotated the part of the English data with speaker and emotion information. 
Our corpus can be utilized to develop and assess such tasks as speaker and emotion recognition, affective speech recognition and synthesis, linguistic, and paralinguistic speech-to-speech translation as well as a multimodal dialog system.",{"paper_id":9287,"title":9288,"year":81,"month":855,"day":63,"doi":9289,"resource_url":9290,"first_page":63,"last_page":63,"pdf_url":9291,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9292,"paper_type":860,"authors":9293,"abstract":9304},"lrec2018-main-469","QUEST: A Natural Language Interface to Relational Databases","10.63317\u002F399pd9wvqqqc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-469","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1100.pdf","sheinin-etal-2018-quest",[9294,9295,9298,9300,9302,9303],{"paper_id":9287,"author_seq":247,"given_name":1572,"surname":1573,"affiliation":63,"orcid":63},{"paper_id":9287,"author_seq":232,"given_name":9296,"surname":9297,"affiliation":63,"orcid":63},"Elahe","Khorashani",{"paper_id":9287,"author_seq":218,"given_name":9299,"surname":6665,"affiliation":63,"orcid":63},"Hangu",{"paper_id":9287,"author_seq":203,"given_name":9301,"surname":1585,"affiliation":63,"orcid":63},"Kun",{"paper_id":9287,"author_seq":188,"given_name":1569,"surname":1570,"affiliation":63,"orcid":63},{"paper_id":9287,"author_seq":172,"given_name":1566,"surname":1567,"affiliation":63,"orcid":63},"Natural language interfaces to databases systems allow the user to use natural language to interrogate a database. Current systems mainly focus on simple queries but neglect nested queries, which are predominant in real cases. We present a NLIDB system, QUEST, which is able to cope with nested logic queries, without imposing any restriction on the input query. 
QUEST outperforms a strong baseline system by 11% accuracy.",{"paper_id":9306,"title":9307,"year":81,"month":855,"day":63,"doi":9308,"resource_url":9309,"first_page":63,"last_page":63,"pdf_url":9310,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9311,"paper_type":860,"authors":9312,"abstract":9320},"lrec2018-main-470","TF-LM: TensorFlow-based Language Modeling Toolkit","10.63317\u002F36av4ouoagna","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-470","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F62.pdf","verwimp-etal-2018-tf",[9313,9316,9318],{"paper_id":9306,"author_seq":247,"given_name":9314,"surname":9315,"affiliation":63,"orcid":63},"Lyan","Verwimp",{"paper_id":9306,"author_seq":232,"given_name":2298,"surname":9317,"affiliation":63,"orcid":63},"Van hamme",{"paper_id":9306,"author_seq":218,"given_name":2451,"surname":9319,"affiliation":63,"orcid":63},"Wambacq","Recently, an abundance of deep learning toolkits has been made freely available. These toolkits typically offer the building blocks and sometimes simple example scripts, but designing and training a model still takes a considerable amount of time and knowledge. We present language modeling scripts based on TensorFlow that allow one to train and test competitive models directly, by using a pre-defined configuration or changing it to their needs. There are several options for input features (words, characters, words combined with characters, character n-grams) and for batching (sentence- or discourse-level). The models can be used to test the perplexity, predict the next word(s), re-score hypotheses or generate debugging files for interpolation with n-gram models. Additionally, we make available LSTM language models trained on a variety of Dutch texts and English benchmarks, that can be used immediately, thereby avoiding the time and computationally expensive training process. 
The toolkit is open source and can be found at https:\u002F\u002Fgithub.com\u002Flverwimp\u002Ftf-lm.",{"paper_id":9322,"title":9323,"year":81,"month":855,"day":63,"doi":9324,"resource_url":9325,"first_page":63,"last_page":63,"pdf_url":9326,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9327,"paper_type":860,"authors":9328,"abstract":9333},"lrec2018-main-471","Grapheme-level Awareness in Word Embeddings for Morphologically Rich Languages","10.63317\u002F23vbxeycgfas","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-471","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F133.pdf","park-shin-2018-grapheme",[9329,9331],{"paper_id":9322,"author_seq":247,"given_name":9330,"surname":5615,"affiliation":63,"orcid":63},"Suzi",{"paper_id":9322,"author_seq":232,"given_name":9332,"surname":3597,"affiliation":63,"orcid":63},"Hyopil","Learning word vectors from character level is an effective method to improve word embeddings for morphologically rich languages. However, most of these techniques have been applied to languages that are inflectional and written in Roman alphabets. In this paper, we investigate languages that are agglutinative and represented by non-alphabetic scripts, choosing Korean as a case study. We present a grapheme-level coding procedure for neural word embedding that utilizes word-internal features that are composed of syllable characters (Character CNN). 
Observing that our grapheme-level model is more capable of representing functional and semantic similarities, grouping allomorphs, and disambiguating homographs than syllable-level and word-level models, we recognize the importance of knowledge on the morphological typology and diversity of writing systems.",{"paper_id":9335,"title":9336,"year":81,"month":855,"day":63,"doi":9337,"resource_url":9338,"first_page":63,"last_page":63,"pdf_url":9339,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9340,"paper_type":860,"authors":9341,"abstract":9348},"lrec2018-main-472","Building a Constraint Grammar Parser for Plains Cree Verbs and Arguments","10.63317\u002F3x9fjezosbv9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-472","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F873.pdf","schmirler-etal-2018-building",[9342,9345,9346,9347],{"paper_id":9335,"author_seq":247,"given_name":9343,"surname":9344,"affiliation":63,"orcid":63},"Katherine","Schmirler",{"paper_id":9335,"author_seq":232,"given_name":6296,"surname":6297,"affiliation":63,"orcid":63},{"paper_id":9335,"author_seq":218,"given_name":7525,"surname":7526,"affiliation":63,"orcid":63},{"paper_id":9335,"author_seq":203,"given_name":7522,"surname":7523,"affiliation":63,"orcid":63},"This paper discusses the development and application of a Constraint Grammar parser for the Plains Cree language. The focus of this parser is the identification of relationships between verbs and arguments. The rich morphology and non-configurational syntax of Plains Cree make it an excellent candidate for the application of a Constraint Grammar parser, which is comprised of sets of constraints with two aims: 1) the disambiguation of ambiguous word forms, and 2) the mapping of syntactic relationships between word forms on the basis of morphological features and sentential context. 
Syntactic modelling of verb and argument relationships in Plains Cree is demonstrated to be a straightforward process, though various semantic and pragmatic features should improve the current parser considerably. When applied to even a relatively small corpus of Plains Cree, the Constraint Grammar parser allows for the identification of common word order patterns and for relationships between word order and information structure to become apparent.",{"paper_id":9350,"title":9351,"year":81,"month":855,"day":63,"doi":9352,"resource_url":9353,"first_page":63,"last_page":63,"pdf_url":9354,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9355,"paper_type":860,"authors":9356,"abstract":9361},"lrec2018-main-473","BPEmb: Tokenization-free Pre-trained Subword Embeddings in 275 Languages","10.63317\u002F3eqtu58tsqay","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-473","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1049.pdf","heinzerling-strube-2018-bpemb",[9357,9359],{"paper_id":9350,"author_seq":247,"given_name":2202,"surname":9358,"affiliation":63,"orcid":63},"Heinzerling",{"paper_id":9350,"author_seq":232,"given_name":1780,"surname":9360,"affiliation":63,"orcid":63},"Strube","We present BPEmb, a collection of pre-trained subword unit embeddings in 275 languages, based on Byte-Pair Encoding (BPE). In an evaluation using fine-grained entity typing as testbed, BPEmb performs competitively, and for some languages better than alternative subword approaches, while requiring vastly fewer resources and no tokenization. 
BPEmb is available at https:\u002F\u002Fgithub.com\u002Fbheinzerling\u002Fbpemb.",{"paper_id":9363,"title":9364,"year":81,"month":855,"day":63,"doi":9365,"resource_url":9366,"first_page":63,"last_page":63,"pdf_url":9367,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9368,"paper_type":860,"authors":9369,"abstract":9374},"lrec2018-main-474","Reference production in human-computer interaction: Issues for Corpus-based Referring Expression Generation","10.63317\u002F3ptqhunuagxi","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-474","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F4.pdf","rocha-paraboni-2018-reference",[9370,9373],{"paper_id":9363,"author_seq":247,"given_name":9371,"surname":9372,"affiliation":63,"orcid":63},"Danillo","Rocha",{"paper_id":9363,"author_seq":232,"given_name":4264,"surname":4265,"affiliation":63,"orcid":63},"In the Natural Language Generation field, Referring Expression Generation (REG) studies often make use of experiments involving human subjects for the collection of corpora of definite descriptions. Experiments of this kind usually make use of web-based settings in which a single subject acts as a speaker with no particular addressee in mind (as a kind of monologue situation), or in which participant pairs are engaged in an actual dialogue. Both so-called monologue and dialogue settings are of course instances of real language use, but it is not entirely clear whether these situations are truly comparable or, to be more precise, whether REG studies may draw conclusions regarding attribute selection, referential overspecification and others regardless of the mode of communication. To shed light on this issue, in this work we developed a parallel, semantically annotated  corpus of monologue and dialogue referring expressions, and carried out an experiment to compare instances produced in both modes of communication. 
Preliminary results suggest that human reference production may be indeed affected by the presence of a second (specific) human participant as the receiver of the communication in a number of ways, an observation that may be relevant for studies in REG and related fields.",{"paper_id":9376,"title":9377,"year":81,"month":855,"day":63,"doi":9378,"resource_url":9379,"first_page":63,"last_page":63,"pdf_url":9380,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9381,"paper_type":860,"authors":9382,"abstract":9386},"lrec2018-main-475","Definite Description Lexical Choice: taking Speaker’s Personality into account","10.63317\u002F2mxb5vjnwvtb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-475","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F35.pdf","lan-paraboni-2018-definite",[9383,9385],{"paper_id":9376,"author_seq":247,"given_name":2050,"surname":9384,"affiliation":63,"orcid":63},"Lan",{"paper_id":9376,"author_seq":232,"given_name":4264,"surname":4265,"affiliation":63,"orcid":63},"In Natural Language Generation (NLG), Referring Expression Generation (REG)  lexical choice is the subtask that provides words to express a given input meaning representation. Since lexical choices made in real language use tend to vary greatly across speakers, computational models of lexicalisation have long addressed the issue of human variation in the REG field as well. However, studies of this kind will often rely on large collections of pre-recorded linguistic examples produced by every single speaker of interest, and on every domain under consideration, to obtain meaning-to-text mappings from which the lexicalisation model is built. As a result, speaker-dependent lexicalisation may be impractical when suitable annotated corpora are not available. 
As an alternative to corpus-based approaches of this kind, this paper argues that differences across human speakers may be at least partially influenced by personality, and presents a personality-dependent lexical choice model for REG that is, to the best of our knowledge, the first of its kind. Preliminary results show that our personality-dependent approach outperforms a standard lexicalisation model (i.e., based on meaning-to-text mappings alone), and that the use of personality information may be a viable alternative to strategies that rely  on corpus knowledge.",{"paper_id":9388,"title":9389,"year":81,"month":855,"day":63,"doi":9390,"resource_url":9391,"first_page":63,"last_page":63,"pdf_url":9392,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9393,"paper_type":860,"authors":9394,"abstract":9398},"lrec2018-main-476","Referring Expression Generation in time-constrained communication","10.63317\u002F5e6ny5az5wiw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-476","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F39.pdf","mariotti-paraboni-2018-referring",[9395,9397],{"paper_id":9388,"author_seq":247,"given_name":4788,"surname":9396,"affiliation":63,"orcid":63},"Mariotti",{"paper_id":9388,"author_seq":232,"given_name":4264,"surname":4265,"affiliation":63,"orcid":63},"In  game-like applications and many others, an underlying Natural Language Generation system may have to express urgency or other dynamic aspects of a fast-evolving situation as text, which may be considerably different from text produced under so-called `normal' circumstances (e.g., without time constrains). 
As a means to shed light on possible differences of this kind,  this paper addresses  the computational generation of natural language text in time-constrained communication by presenting two experiments that use the attribute selection task of definite descriptions (or Referring Expression Generation - REG) as a working example. In the first experiment, we describe a psycholinguistic study in which human participants are engaged in a time-constrained reference production task. This results in a corpus of time-constrained descriptions to be compared with `normal' descriptions available from an existing (i.e., with no  time constraint) REG corpus. In the second experiment, we discuss how a REG algorithm may be customised so as to produce time-constrained descriptions that resemble those produced by human speakers in similar situations. The proposed algorithm is then evaluated against the time-constrained descriptions produced by the human subjects in the first experiment, and it is shown to outperform standard approaches to REG in these conditions.",{"paper_id":9400,"title":9401,"year":81,"month":855,"day":63,"doi":9402,"resource_url":9403,"first_page":63,"last_page":63,"pdf_url":9404,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9405,"paper_type":860,"authors":9406,"abstract":9414},"lrec2018-main-477","Incorporating Semantic Attention in Video Description Generation","10.63317\u002F578s3v4ussi3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-477","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F98.pdf","laokulrat-etal-2018-incorporating",[9407,9410,9413],{"paper_id":9400,"author_seq":247,"given_name":9408,"surname":9409,"affiliation":63,"orcid":63},"Natsuda","Laokulrat",{"paper_id":9400,"author_seq":232,"given_name":9411,"surname":9412,"affiliation":63,"orcid":63},"Naoaki","Okazaki",{"paper_id":9400,"author_seq":218,"given_name":8728,"surname":8729,"affiliation":63,"orcid":63},"Automatically 
generating video description is one of the approaches to enable computers to deeply understand videos, which can have a great impact and can be useful to many other applications. However, generated descriptions by computers often fail to correctly mention objects and actions appearing in the videos. This work aims to alleviate this problem by including external fine-grained visual information, which can be detected from all video frames, in the description generation model. In this paper, we propose an LSTM-based sequence-to-sequence model with semantic attention mechanism for video description generation. The model is flexible so that we can change the source of the external information without affecting the encoding and decoding parts of the model. The results show that using semantic attention to selectively focus on external fine-grained visual information can guide the system to correctly mention objects and actions in videos and have a better quality of video descriptions.",{"paper_id":9416,"title":9417,"year":81,"month":855,"day":63,"doi":9418,"resource_url":9419,"first_page":63,"last_page":63,"pdf_url":9420,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9421,"paper_type":860,"authors":9422,"abstract":9436},"lrec2018-main-478","GenDR: A Generic Deep Realizer with Complex 
Lexicalization","10.63317\u002F5htxpej9286e","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-478","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F450.pdf","lareau-etal-2018-gendr",[9423,9425,9428,9431,9433],{"paper_id":9416,"author_seq":247,"given_name":3525,"surname":9424,"affiliation":63,"orcid":63},"Lareau",{"paper_id":9416,"author_seq":232,"given_name":9426,"surname":9427,"affiliation":63,"orcid":63},"Florie","Lambrey",{"paper_id":9416,"author_seq":218,"given_name":9429,"surname":9430,"affiliation":63,"orcid":63},"Ieva","Dubinskaite",{"paper_id":9416,"author_seq":203,"given_name":2554,"surname":9432,"affiliation":63,"orcid":63},"Galarreta-Piquette",{"paper_id":9416,"author_seq":188,"given_name":9434,"surname":9435,"affiliation":63,"orcid":63},"Maryam","Nejat","We present a generic deep realizer called GenDR, which takes as input an abstract semantic representation of predicate-argument relations, and produces corresponding syntactic dependency structures in English, French, Lithuanian and Persian, with the possibility to fairly easily add more languages. It is generic in that it is designed to operate across a wide range of languages and applications, given the appropriate lexical resources. The focus is on the lexicalization of multiword expressions, with built-in rules to handle thousands of different cross-linguistic patterns of collocations (intensifiers, support verbs, causatives, etc.), and on rich paraphrasing, with the ability to produce many syntactically and lexically varied outputs from the same input. The system runs on a graph transducer, MATE, and its grammar design is directly borrowed from MARQUIS, which we have trimmed down to its core and built upon. The grammar and demo dictionaries are distributed under a CC-BY-SA licence (http:\u002F\u002Fbit.ly\u002F2x8xGVO). 
This paper explains the design of the grammar, how multiword expressions (especially collocations) are dealt with, and how the syntactic structure is derived from the relative communicative salience of the meanings involved.",{"paper_id":9438,"title":9439,"year":81,"month":855,"day":63,"doi":9440,"resource_url":9441,"first_page":63,"last_page":63,"pdf_url":9442,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9443,"paper_type":860,"authors":9444,"abstract":9451},"lrec2018-main-479","A Detailed Evaluation of Neural Sequence-to-Sequence Models for In-domain and Cross-domain Text Simplification","10.63317\u002F26hi9ayu28ui","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-479","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F620.pdf","stajner-nisioi-2018-detailed",[9445,9448],{"paper_id":9438,"author_seq":247,"given_name":9446,"surname":9447,"affiliation":63,"orcid":63},"Sanja","Štajner",{"paper_id":9438,"author_seq":232,"given_name":9449,"surname":9450,"affiliation":63,"orcid":63},"Sergiu","Nisioi","We present a detailed evaluation and analysis of neural sequence-to-sequence models for text simplification on two distinct datasets: Simple Wikipedia and Newsela. We employ both human and automatic evaluation to investigate the capacity of neural models to generalize across corpora, and we highlight challenges that these models face when tested on a different genre. 
Furthermore, we establish a strong baseline on the Newsela dataset and show that a simple neural architecture can be efficiently used for in-domain and cross-domain text simplification.",{"paper_id":9453,"title":9454,"year":81,"month":855,"day":63,"doi":9455,"resource_url":9456,"first_page":63,"last_page":63,"pdf_url":9457,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9458,"paper_type":860,"authors":9459,"abstract":9469},"lrec2018-main-480","Don’t Annotate, but Validate: a Data-to-Text Method for Capturing Event Data","10.63317\u002F28x4v6psj3no","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-480","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F667.pdf","vossen-etal-2018-dont",[9460,9461,9463,9466],{"paper_id":9453,"author_seq":247,"given_name":4174,"surname":4175,"affiliation":63,"orcid":63},{"paper_id":9453,"author_seq":232,"given_name":6208,"surname":9462,"affiliation":63,"orcid":63},"Ilievski",{"paper_id":9453,"author_seq":218,"given_name":9464,"surname":9465,"affiliation":63,"orcid":63},"Marten","Postma",{"paper_id":9453,"author_seq":203,"given_name":9467,"surname":9468,"affiliation":63,"orcid":63},"Roxane","Segers","In this paper, we present a new method to obtain large volumes of high-quality text corpora with event data for studying identity and reference relations. We report on the current methods to create event reference data by annotating texts and deriving the event data a posteriori. Our method starts from event registries in which event data is defined a priori. From this data, we extract so-called Microworlds of referential data with the Reference Texts that report on these events. This makes it possible to establish referential relations with high precision easily and at a large scale. 
In a pilot, we successfully obtained data from these resources with extreme ambiguity and variation, while maintaining the identity and reference relations and without having to annotate large quantities of texts word-by-word. The data from this pilot was annotated using an annotation tool created specifically in order to validate our method and to enrich the reference texts with event coreference annotations. This annotation process resulted in the Gun Violence Corpus, whose development process and outcome are described in this paper.",{"paper_id":9471,"title":9472,"year":81,"month":855,"day":63,"doi":9473,"resource_url":9474,"first_page":63,"last_page":63,"pdf_url":9475,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9476,"paper_type":860,"authors":9477,"abstract":9490},"lrec2018-main-481","RDF2PT: Generating Brazilian Portuguese Texts from RDF Data","10.63317\u002F3jfb8hkkoosa","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-481","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F783.pdf","moussallem-etal-2018-rdf2pt",[9478,9479,9481,9482,9485,9488,9489],{"paper_id":9471,"author_seq":247,"given_name":7945,"surname":7946,"affiliation":63,"orcid":63},{"paper_id":9471,"author_seq":232,"given_name":3818,"surname":9480,"affiliation":63,"orcid":63},"Ferreira",{"paper_id":9471,"author_seq":218,"given_name":1656,"surname":7952,"affiliation":63,"orcid":63},{"paper_id":9471,"author_seq":203,"given_name":9483,"surname":9484,"affiliation":63,"orcid":63},"Maria Claudia","Cavalcanti",{"paper_id":9471,"author_seq":188,"given_name":9486,"surname":9487,"affiliation":63,"orcid":63},"Geraldo","Xexéo",{"paper_id":9471,"author_seq":172,"given_name":1756,"surname":1757,"affiliation":63,"orcid":63},{"paper_id":9471,"author_seq":155,"given_name":7954,"surname":7955,"affiliation":63,"orcid":63},"The generation of natural language from RDF data has recently gained significant attention due to the continuous growth of Linked 
Data. A number of these approaches generate natural language in languages other than English, however, no work has been proposed to generate Brazilian Portuguese texts out of RDF. We address this research gap by presenting RDF2PT, an approach that verbalizes RDF data to Brazilian Portuguese language. We evaluated RDF2PT in an open questionnaire with 44 native speakers divided into experts and non-experts. Our results suggest that RDF2PT is able to generate text which is similar to that generated by humans and can hence be easily understood.",{"paper_id":9492,"title":9493,"year":81,"month":855,"day":63,"doi":9494,"resource_url":9495,"first_page":63,"last_page":63,"pdf_url":9496,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9497,"paper_type":860,"authors":9498,"abstract":9503},"lrec2018-main-482","Towards a music-language mapping","10.63317\u002F3ycjiaz3v7gc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-482","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F831.pdf","berlingerio-bonin-2018-towards",[9499,9502],{"paper_id":9492,"author_seq":247,"given_name":9500,"surname":9501,"affiliation":63,"orcid":63},"Michele","Berlingerio",{"paper_id":9492,"author_seq":232,"given_name":7739,"surname":7740,"affiliation":63,"orcid":63},"We explore a novel research idea, that we call Musical Language   Processing (MLP), which investigates the possibility of a musical   input to speech interaction systems. We present the first     attempts at finding a mapping between musical pieces and dialogues,   based on the frequency of musical patterns. 
Our findings on one   possible    alignment between classical piano compositions and dialogues from   popular TV series are encouraging, and open the way to further   investigations along this line of research.",{"paper_id":9505,"title":9506,"year":81,"month":855,"day":63,"doi":9507,"resource_url":9508,"first_page":63,"last_page":63,"pdf_url":9509,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9510,"paper_type":860,"authors":9511,"abstract":9519},"lrec2018-main-483","Up-cycling Data for Natural Language Generation","10.63317\u002F4vv8dtefbah2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-483","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F927.pdf","isard-etal-2018-cycling",[9512,9515,9517],{"paper_id":9505,"author_seq":247,"given_name":9513,"surname":9514,"affiliation":63,"orcid":63},"Amy","Isard",{"paper_id":9505,"author_seq":232,"given_name":1785,"surname":9516,"affiliation":63,"orcid":63},"Oberlander",{"paper_id":9505,"author_seq":218,"given_name":1217,"surname":9518,"affiliation":63,"orcid":63},"Grover","Museums and other cultural heritage institutions have large databases of information about the objects in their collections, and existing Natural Language Generation (NLG) systems can generate fluent and adaptive texts for visitors, given appropriate input data, but there is typically a large amount of expert human effort required to bridge the gap between the available and the required data. We describe automatic processes which aim to significantly reduce the need for expert input during the conversion and up-cycling process. We detail domain-independent techniques for processing and enhancing data into a format which allows an existing NLG system to create adaptive texts. First we normalize the dates and names which occur in the data, and we link to the Semantic Web to add extra object descriptions. 
Then we use Semantic Web queries combined with a wide coverage grammar of English to extract relations which can be used to express the content of database fields in language accessible to a general user.  As our test domain we use a database from the Edinburgh Musical Instrument Museum.",{"paper_id":9521,"title":9522,"year":81,"month":855,"day":63,"doi":9523,"resource_url":9524,"first_page":63,"last_page":63,"pdf_url":9525,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9526,"paper_type":860,"authors":9527,"abstract":9533},"lrec2018-main-484","Neural Models of Selectional Preferences for Implicit Semantic Role Labeling","10.63317\u002F26w4os8fcfow","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-484","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F624.pdf","le-fokkens-2018-neural",[9528,9530],{"paper_id":9521,"author_seq":247,"given_name":9529,"surname":2400,"affiliation":63,"orcid":63},"Minh",{"paper_id":9521,"author_seq":232,"given_name":9531,"surname":9532,"affiliation":63,"orcid":63},"Antske","Fokkens","Implicit Semantic Role Labeling is a challenging task: it requires high-level understanding of the text while annotated data is very limited. Due to the lack of training data, most researches either resort to simplistic machine learning methods or focus on automatically acquiring training data. In this paper, we explore the possibilities of using more complex and expressive machine learning models trained on a large amount of explicit roles. In addition, we compare the impact of one-way and multi-way selectional preference with the hypothesis that the added information in multi-way models are beneficial. Although our models surpass a baseline that uses prototypical vectors for SemEval-2010, we otherwise face mostly negative results. Selectional preference models perform lower than the baseline on ON5V, a dataset of five ambiguous and frequent verbs. 
They are also outperformed by the Naïve Bayes model of Feizabadi and Pado (2015) on both datasets. Even though multi-way selectional preference improves results for predicting explicit semantic roles compared to one-way selectional preference, it harms performance for implicit roles. We release our source code, including the reimplementation of two previously unavailable systems to enable further experimentation.",{"paper_id":9535,"title":9536,"year":81,"month":855,"day":63,"doi":9537,"resource_url":9538,"first_page":63,"last_page":63,"pdf_url":9539,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9540,"paper_type":860,"authors":9541,"abstract":9548},"lrec2018-main-485","A database of German definitory contexts from selected web sources","10.63317\u002F49mk6ujedbph","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-485","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F668.pdf","barbaresi-etal-2018-database",[9542,9543,9546],{"paper_id":9535,"author_seq":247,"given_name":3314,"surname":3315,"affiliation":63,"orcid":63},{"paper_id":9535,"author_seq":232,"given_name":9544,"surname":9545,"affiliation":63,"orcid":63},"Lothar","Lemnitzer",{"paper_id":9535,"author_seq":218,"given_name":2736,"surname":9547,"affiliation":63,"orcid":63},"Geyken","We introduce work on the detection of definitory contexts designed to speed up two lexicographical tasks: searching for the exact meaning(s) of terms and providing usable input for paraphrasing. Our database is built from a specialized web corpus using a robust pattern-based extraction method. The corresponding interface displays information for a large range of lexical units. 
The contributions of this article are threefold: we describe both acquisition and extraction, provide a qualitative assessment of the method, and present an interface to access the data.",{"paper_id":9550,"title":9551,"year":81,"month":855,"day":63,"doi":9552,"resource_url":9553,"first_page":63,"last_page":63,"pdf_url":9554,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9555,"paper_type":860,"authors":9556,"abstract":9563},"lrec2018-main-486","Annotating Abstract Meaning Representations for Spanish","10.63317\u002F4v64ya2w4mu8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-486","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F743.pdf","migueles-abraira-etal-2018-annotating",[9557,9560,9561],{"paper_id":9550,"author_seq":247,"given_name":9558,"surname":9559,"affiliation":63,"orcid":63},"Noelia","Migueles-Abraira",{"paper_id":9550,"author_seq":232,"given_name":1953,"surname":7501,"affiliation":63,"orcid":63},{"paper_id":9550,"author_seq":218,"given_name":3288,"surname":9562,"affiliation":63,"orcid":63},"Diaz de Ilarraza","Until recently, semantic annotations for different semantic phenomena were independent and unconnected. The Abstract Meaning Representation (AMR) project arised out of the need to create a broad-coverage semantic bank containing a unified set of semantic information represented in simple, single-rooted, easy-to-read structures. Because the semantic representation language proposed in AMR is biased towards English, annotating AMR structures for other languages, such as Spanish, is not a trivial task. In this paper we propose a linguistic method that we believe would help lay the groundwork for building a large semantic bank for Spanish and would guide those who would like to implement it for other languages. 
Thus, we analyze a broad spectrum of Spanish linguistic phenomena to come up with suggestions to adapt the current guidelines so that it is possible to annotate AMRs for Spanish. As a result of this work, we make available the first public online repository containing manually annotated Spanish AMRs.",{"paper_id":9565,"title":9566,"year":81,"month":855,"day":63,"doi":9567,"resource_url":9568,"first_page":63,"last_page":63,"pdf_url":9569,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9570,"paper_type":860,"authors":9571,"abstract":9579},"lrec2018-main-487","Browsing the Terminological Structure of a Specialized Domain: A Method Based on Lexical Functions and their Classification","10.63317\u002F4fuh9fxc8bfe","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-487","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F840.pdf","lhomme-etal-2018-browsing",[9572,9575,9577],{"paper_id":9565,"author_seq":247,"given_name":9573,"surname":9574,"affiliation":63,"orcid":63},"Marie-Claude","L’Homme",{"paper_id":9565,"author_seq":232,"given_name":6238,"surname":9576,"affiliation":63,"orcid":63},"Robichaud",{"paper_id":9565,"author_seq":218,"given_name":6874,"surname":9578,"affiliation":63,"orcid":63},"Prévil","This paper describes a method for browsing relations between terms and unveiling the terminological structure of a specialized domain. The method consists in expanding a graph that takes as input the relations encoded in a multilingual terminological resource called the DiCoEnviro that contains terms in the field of the environment. In the DiCoEnviro, terminological relations are encoded using lexical functions (Melčuk et al. 1995) and further classified in families defined on the basis of the properties of relations. We seek to provide users with an explicit and intuitive representation of a wide variety of relations. 
We also make the most of the richness of the encoding, while implementing some graphical choices to make their interpretation as clear as possible for end users. The method is implemented in a tool called NeoVisual that provides access to more than 11,000 relations in English and 15,000 relations in French. Portuguese is also included and coverage in all languages will increase as new entries are added to the DiCoEnviro.",{"paper_id":9581,"title":9582,"year":81,"month":855,"day":63,"doi":9583,"resource_url":9584,"first_page":63,"last_page":63,"pdf_url":9585,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9586,"paper_type":860,"authors":9587,"abstract":9596},"lrec2018-main-488","Rollenwechsel-English: a large-scale semantic role corpus","10.63317\u002F4atchm3v4amc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-488","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F922.pdf","sayeed-etal-2018-rollenwechsel",[9588,9591,9593],{"paper_id":9581,"author_seq":247,"given_name":9589,"surname":9590,"affiliation":63,"orcid":63},"Asad","Sayeed",{"paper_id":9581,"author_seq":232,"given_name":4670,"surname":9592,"affiliation":63,"orcid":63},"Shkadzko",{"paper_id":9581,"author_seq":218,"given_name":9594,"surname":9595,"affiliation":63,"orcid":63},"Vera","Demberg","We present the Rollenwechsel-English (RW-eng) corpus, a large corpus of automatically-labelled semantic frames extracted from the ukWaC corpus and BNC using Propbank roles.  RW-eng contains both full-phrase constituents for labelled roles as well as heads identified by a series of heuristics.  This corpus is of a scale and size suitable for new deep learning approaches to language modelling and distributional semantics, particularly as it pertains to generalized event knowledge.  
We describe the structure of this corpus, tools for its use, and successful use cases.",{"paper_id":9598,"title":9599,"year":81,"month":855,"day":63,"doi":9600,"resource_url":9601,"first_page":63,"last_page":63,"pdf_url":9602,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9603,"paper_type":860,"authors":9604,"abstract":9614},"lrec2018-main-489","Towards a Standardized Dataset for Noun Compound Interpretation","10.63317\u002F2j4s9i9hsc9w","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-489","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F937.pdf","ponkiya-etal-2018-towards",[9605,9608,9610,9611],{"paper_id":9598,"author_seq":247,"given_name":9606,"surname":9607,"affiliation":63,"orcid":63},"Girishkumar","Ponkiya",{"paper_id":9598,"author_seq":232,"given_name":2184,"surname":9609,"affiliation":63,"orcid":63},"Patel",{"paper_id":9598,"author_seq":218,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},{"paper_id":9598,"author_seq":203,"given_name":9612,"surname":9613,"affiliation":63,"orcid":63},"Girish K","Palshikar","Noun compounds are interesting constructs in Natural Language Processing (NLP). Interpretation of noun compounds is the task of uncovering a relationship between component nouns of a noun compound. There has not been much progress in this field due to lack of a standardized set of relation inventory and associated annotated dataset which can be used to evaluate suggested solutions. Available datasets in the literature suffer from two problems. Firstly, the approaches to creating some of the relation inventories and datasets are statistically motivated, rather than being linguistically motivated. Secondly, there is little overlap among the semantic relation inventories used by them. We attempt to bridge this gap through our paper. 
We present a dataset that is (a) linguistically grounded by using Levi (1978)'s theory, and (b) uses frame elements of FrameNet as its semantic relation inventory. The dataset consists of 2,600 examples created by an automated extraction from FrameNet annotated corpus, followed by a manual investigation. These attributes make our dataset useful for noun compound interpretation in a general-purpose setting.",{"paper_id":9616,"title":9617,"year":81,"month":855,"day":63,"doi":9618,"resource_url":9619,"first_page":63,"last_page":63,"pdf_url":9620,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9621,"paper_type":860,"authors":9622,"abstract":9628},"lrec2018-main-490","Structured Interpretation of Temporal Relations","10.63317\u002F4mbueiq9zvj2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-490","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F985.pdf","zhang-xue-2018-structured",[9623,9625],{"paper_id":9616,"author_seq":247,"given_name":9624,"surname":2381,"affiliation":63,"orcid":63},"Yuchen",{"paper_id":9616,"author_seq":232,"given_name":9626,"surname":9627,"affiliation":63,"orcid":63},"Nianwen","Xue","Temporal relations between events and time expressions in a document are often modeled in an unstructured manner where relations between individual pairs of time expressions and events are considered in isolation. This often results in inconsistent and incomplete annotation and computational modeling. We propose a novel annotation approach where events and time expressions in a document form a dependency tree in which each dependency relation corresponds to an instance of temporal anaphora where the antecedent is the parent and the anaphor is the child. We annotate a corpus of 235 documents using this approach in the two genres of news and narratives, with 48 documents doubly annotated. 
We report a stable and high inter-annotator agreement on the doubly annotated subset, validating our approach, and perform a quantitative comparison between the two genres of the entire corpus. We make this corpus publicly available.",{"paper_id":9630,"title":9631,"year":81,"month":855,"day":63,"doi":9632,"resource_url":9633,"first_page":63,"last_page":63,"pdf_url":9634,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9635,"paper_type":860,"authors":9636,"abstract":9648},"lrec2018-main-491","NL2Bash: A Corpus and Semantic Parser for Natural Language Interface to the Linux Operating System","10.63317\u002F42er6x4bacdf","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-491","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1021.pdf","lin-etal-2018-nl2bash",[9637,9640,9642,9645],{"paper_id":9630,"author_seq":247,"given_name":9638,"surname":9639,"affiliation":63,"orcid":63},"Xi Victoria","Lin",{"paper_id":9630,"author_seq":232,"given_name":9641,"surname":1588,"affiliation":63,"orcid":63},"Chenglong",{"paper_id":9630,"author_seq":218,"given_name":9643,"surname":9644,"affiliation":63,"orcid":63},"Luke","Zettlemoyer",{"paper_id":9630,"author_seq":203,"given_name":9646,"surname":9647,"affiliation":63,"orcid":63},"Michael D.","Ernst","We present new data and semantic parsing methods for the problem of mapping English sentences to Bash commands (NL2Bash). Our long-term goal is to enable any user to perform otherwise repetitive computer operations (such as file manipulation, search, and application-specific scripting) by simply stating their goals in English. 
We take a first step in this domain, by providing a large new dataset of challenging but commonly used Bash commands and expert-written English descriptions, along with the baseline methods to establish performance levels on this task.",{"paper_id":9650,"title":9651,"year":81,"month":855,"day":63,"doi":9652,"resource_url":9653,"first_page":63,"last_page":63,"pdf_url":9654,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9655,"paper_type":860,"authors":9656,"abstract":9666},"lrec2018-main-492","World Knowledge for Abstract Meaning Representation Parsing","10.63317\u002F4aatrxmmevba","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-492","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1085.pdf","welch-etal-2018-world",[9657,9659,9662,9663],{"paper_id":9650,"author_seq":247,"given_name":7736,"surname":9658,"affiliation":63,"orcid":63},"Welch",{"paper_id":9650,"author_seq":232,"given_name":9660,"surname":9661,"affiliation":63,"orcid":63},"Jonathan K.","Kummerfeld",{"paper_id":9650,"author_seq":218,"given_name":2599,"surname":2590,"affiliation":63,"orcid":63},{"paper_id":9650,"author_seq":203,"given_name":9664,"surname":9665,"affiliation":63,"orcid":63},"Rada","Mihalcea","In this paper we explore the role played by world knowledge in semantic parsing. We look at the types of errors that currently exist in a state-of-the-art Abstract Meaning Representation (AMR) parser, and explore the problem of how to integrate world knowledge to reduce these errors. We look at three types of knowledge from (1) WordNet hypernyms and super senses, (2) Wikipedia entity links, and (3) retraining a named entity recognizer to identify concepts in AMR. 
The retrained entity recognizer is not perfect and cannot recognize all concepts in AMR and we examine the limitations of the named entity features using a set of oracles. The oracles show how performance increases if it can recognize different subsets of AMR concepts. These results show improvement on multiple fine-grained metrics, including a 6% increase in named entity F-score, and provide insight into the potential of world knowledge for future work in Abstract Meaning Representation parsing.",{"paper_id":9668,"title":9669,"year":81,"month":855,"day":63,"doi":9670,"resource_url":9671,"first_page":63,"last_page":63,"pdf_url":9672,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9673,"paper_type":860,"authors":9674,"abstract":9682},"lrec2018-main-493","Improved Transcription and Indexing of Oral History Interviews for Digital Humanities Research","10.63317\u002F578awfya4g2a","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-493","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F137.pdf","gref-etal-2018-improved",[9675,9677,9679],{"paper_id":9668,"author_seq":247,"given_name":1780,"surname":9676,"affiliation":63,"orcid":63},"Gref",{"paper_id":9668,"author_seq":232,"given_name":6380,"surname":9678,"affiliation":63,"orcid":63},"Köhler",{"paper_id":9668,"author_seq":218,"given_name":9680,"surname":9681,"affiliation":63,"orcid":63},"Almut","Leh","This paper describes different approaches to improve the transcription and indexing quality of the Fraunhofer IAIS Audio Mining system on Oral History interviews for the Digital Humanities Research. As an essential component of the Audio Mining system, automatic speech recognition faces a lot of difficult challenges when processing Oral History interviews. We aim to overcome these challenges using state-of-the-art automatic speech recognition technology. 
Different acoustic modeling techniques, like multi-condition training and sophisticated neural networks, are applied to train robust acoustic models. To evaluate the performance of these models on Oral History interviews a German Oral History test-set is presented. This test-set represents the large audio-visual archives \"Deutsches Gedächtnis\" of the Institute for History and Biography. The combination of the different applied techniques results in a word error rate reduced by 28.3% relative on this test-set compared to the current baseline system while only one eighth of the previous amount of training data is used. In context of these experiments new opportunities are set out for Oral History research offered by Audio Mining. Also the workflow is described used by Audio Mining to process long audio-files to automatically create time-aligned transcriptions.",{"paper_id":9684,"title":9685,"year":81,"month":855,"day":63,"doi":9686,"resource_url":9687,"first_page":63,"last_page":63,"pdf_url":9688,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9689,"paper_type":860,"authors":9690,"abstract":9701},"lrec2018-main-494","Sound Signal Processing with Seq2Tree Network","10.63317\u002F4ze3sdpbibme","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-494","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F164.pdf","ma-etal-2018-sound",[9691,9693,9695,9698,9700],{"paper_id":9684,"author_seq":247,"given_name":9692,"surname":3400,"affiliation":63,"orcid":63},"Weicheng",{"paper_id":9684,"author_seq":232,"given_name":9694,"surname":7336,"affiliation":63,"orcid":63},"Kai",{"paper_id":9684,"author_seq":218,"given_name":9696,"surname":9697,"affiliation":63,"orcid":63},"Zhaoheng","Ni",{"paper_id":9684,"author_seq":203,"given_name":1474,"surname":9699,"affiliation":63,"orcid":63},"Chin",{"paper_id":9684,"author_seq":188,"given_name":2398,"surname":1591,"affiliation":63,"orcid":63},"Long Short-Term Memory (LSTM) and its 
variants have been the standard solution to sequential data processing tasks because of their ability to preserve previous information weighted on distance. This feature provides the LSTM family with additional information in predictions, compared to regular Recurrent Neural Networks (RNNs) and Bag-of-Words (BOW) models. In other words, LSTM networks assume the data to be chain-structured. The longer the distance between two data points, the less related the data points are. However, this is usually not the case for real multimedia signals including text, sound and music. In real data, this chain-structured dependency exists only across meaningful groups of data units but not over single units directly. For example, in a prediction task over sound signals, a meaningful word could give a strong hint to its following word as a whole but not the first phoneme of that word. This undermines the ability of LSTM networks in modeling multimedia data, which is pattern-rich. In this paper we take advantage of Seq2Tree network, a dynamically extensible tree-structured neural network architecture which helps solve the problem LSTM networks face in sound signal processing tasks-the unbalanced connections among data units inside and outside semantic groups. 
Experiments show that Seq2Tree network outperforms the state-of-the-art Bidirectional LSTM (BLSTM) model on a signal and noise separation task (CHiME Speech Separation and Recognition Challenge).",{"paper_id":9703,"title":9704,"year":81,"month":855,"day":63,"doi":9705,"resource_url":9706,"first_page":63,"last_page":63,"pdf_url":9707,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9708,"paper_type":860,"authors":9709,"abstract":9722},"lrec2018-main-495","Open ASR for Icelandic: Resources and a Baseline System","10.63317\u002F27mshnxhigos","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-495","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F201.pdf","nikulasdottir-etal-2018-open",[9710,9713,9716,9719],{"paper_id":9703,"author_seq":247,"given_name":9711,"surname":9712,"affiliation":63,"orcid":63},"Anna Björk","Nikulásdóttir",{"paper_id":9703,"author_seq":232,"given_name":9714,"surname":9715,"affiliation":63,"orcid":63},"Inga Rún","Helgadóttir",{"paper_id":9703,"author_seq":218,"given_name":9717,"surname":9718,"affiliation":63,"orcid":63},"Matthías","Pétursson",{"paper_id":9703,"author_seq":203,"given_name":9720,"surname":9721,"affiliation":63,"orcid":63},"Jón","Guðnason","Developing language resources is an important task when creating a speech recognition system for a less-resourced language. In this paper we describe available language resources and their preparation for use in a large vocabulary speech recognition (LVSR) system for Icelandic. The content of a speech corpus is analysed and training and test sets compiled, a pronunciation dictionary is extended, and text normalization for language modeling performed. An ASR system based on neural networks is implemented using these resources and tested using different acoustic training sets. 
Experimental results show a clear increase in word-error-rate (WER) when using smaller training sets, indicating that extension of the speech corpus for training would improve the system. When testing on data with known vocabulary only, the WER is 7.99%, but on an open vocabulary test set the WER is 15.72%. Furthermore, impact of the content of the acoustic training corpus is examined. The current results indicate that an ASR system could profit from carefully selected phonotactical data, however, further experiments are needed to verify this impression. The language resources are available on http:\u002F\u002Fmalfong.is and the source code of the project can be found on https:\u002F\u002Fgithub.com\u002Fcadia-lvl\u002Fice-asr\u002Ftree\u002Fmaster\u002Fice-kaldi.",{"paper_id":9724,"title":9725,"year":81,"month":855,"day":63,"doi":9726,"resource_url":9727,"first_page":63,"last_page":63,"pdf_url":9728,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9729,"paper_type":860,"authors":9730,"abstract":9737},"lrec2018-main-496","Towards Neural Speaker Modeling in Multi-Party Conversation: The Task, Dataset, and Models","10.63317\u002F3bau89hm3k8v","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-496","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F411.pdf","meng-etal-2018-towards",[9731,9732,9734],{"paper_id":9724,"author_seq":247,"given_name":3578,"surname":2720,"affiliation":63,"orcid":63},{"paper_id":9724,"author_seq":232,"given_name":1620,"surname":9733,"affiliation":63,"orcid":63},"Mou",{"paper_id":9724,"author_seq":218,"given_name":9735,"surname":9736,"affiliation":63,"orcid":63},"Zhi","Jin","Neural network-based dialog systems are attracting increasing attention in both academia and industry. Recently, researchers have begun to realize the importance of speaker modeling in neural dialog systems, but there lacks established tasks and datasets. 
In this paper, we propose speaker classification as a surrogate task for general speaker modeling, and collect massive data to facilitate research in this direction. We further investigate temporal-based and content-based models of speakers, and propose several hybrids of them. Experiments show that speaker classification is feasible, and that hybrid models outperform each single component.",{"paper_id":9739,"title":9740,"year":81,"month":855,"day":63,"doi":9741,"resource_url":9742,"first_page":63,"last_page":63,"pdf_url":9743,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9744,"paper_type":860,"authors":9745,"abstract":9755},"lrec2018-main-497","Discriminating between Similar Languages on Imbalanced Conversational Texts","10.63317\u002F2w3hahaptbwb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-497","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F430.pdf","he-etal-2018-discriminating",[9746,9748,9750,9752,9753],{"paper_id":9739,"author_seq":247,"given_name":9747,"surname":1425,"affiliation":63,"orcid":63},"Junqing",{"paper_id":9739,"author_seq":232,"given_name":9749,"surname":3506,"affiliation":63,"orcid":63},"Xian",{"paper_id":9739,"author_seq":218,"given_name":9751,"surname":3578,"affiliation":63,"orcid":63},"Xuemin",{"paper_id":9739,"author_seq":203,"given_name":9207,"surname":2381,"affiliation":63,"orcid":63},{"paper_id":9739,"author_seq":188,"given_name":9754,"surname":9207,"affiliation":63,"orcid":63},"Yonghong","Discriminating between similar languages (DSL) on conversational texts is a challenging task. This paper aims at discriminating between limited-resource languages on short conversational texts, like Uyghur and Kazakh. Considering that Uyghur and Kazakh data are severely imbalanced, we leverage an effective compensation strategy to build a balanced Uyghur and Kazakh corpus. 
Then we construct a maximum entropy classifier based on morphological features to discriminate between the two languages and investigate the contribution of each feature. Empirical results suggest that our system achieves an accuracy of 95.7\\% on our Uyghur and Kazakh dataset, which is higher than that of the CNN classifier. We also apply our system to the out-of-domain subtask of VarDial' 2016 DSL shared tasks to test the system's performance on short conversational texts of other similar languages. Though with much less preprocessing, our system outperforms the champions on both test sets B1 and B2.",{"paper_id":9757,"title":9758,"year":81,"month":855,"day":63,"doi":9759,"resource_url":9760,"first_page":63,"last_page":63,"pdf_url":9761,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9762,"paper_type":860,"authors":9763,"abstract":9768},"lrec2018-main-498","Data-Driven Pronunciation Modeling of Swiss German Dialectal Speech for Automatic Speech Recognition","10.63317\u002F3kmm3b6u9nj7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-498","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F559.pdf","stadtschnitzer-schmidt-2018-data",[9764,9766],{"paper_id":9757,"author_seq":247,"given_name":1780,"surname":9765,"affiliation":63,"orcid":63},"Stadtschnitzer",{"paper_id":9757,"author_seq":232,"given_name":9767,"surname":6653,"affiliation":63,"orcid":63},"Christoph","Automatic speech recognition is a requested technique in many fields like automatic subtitling, dialogue systems and information retrieval systems. The training of an automatic speech recognition system is usually straight forward given a large annotated speech corpus for acoustic modeling, a phonetic lexicon, and a text corpus for the training of a language model. However, in some use cases these resources are not available. In this work, we discuss the training of a Swiss German speech recognition system. 
The only resources that are available is a small size audio corpus, containing the utterances of highly dialectical Swiss German speakers, annotated with a standard German transcription. The desired output of the speech recognizer is again standard German, since there is no other official or standardized way to write Swiss German. We explore strategies to cope with the mismatch between the dialectal pronunciation and the standard German annotation. A Swiss German speech recognizer is trained by adapting a standard German model, based on a Swiss German grapheme-to-phoneme conversion model, which was learned in a data-driven manner. Also, Swiss German speech recognition systems are created, with the pronunciation based on graphemes, standard German pronunciation and with a data-driven Swiss German pronunciation model. The results of the experiments are promising for this challenging task.",{"paper_id":9770,"title":9771,"year":81,"month":855,"day":63,"doi":9772,"resource_url":9773,"first_page":63,"last_page":63,"pdf_url":9774,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9775,"paper_type":860,"authors":9776,"abstract":9784},"lrec2018-main-499","Simulating ASR errors for training SLU systems","10.63317\u002F2varfyg7dorv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-499","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F827.pdf","simonnet-etal-2018-simulating",[9777,9779,9782,9783],{"paper_id":9770,"author_seq":247,"given_name":7852,"surname":9778,"affiliation":63,"orcid":63},"Simonnet",{"paper_id":9770,"author_seq":232,"given_name":9780,"surname":9781,"affiliation":63,"orcid":63},"Sahar","Ghannay",{"paper_id":9770,"author_seq":218,"given_name":6874,"surname":6875,"affiliation":63,"orcid":63},{"paper_id":9770,"author_seq":203,"given_name":6885,"surname":6886,"affiliation":63,"orcid":63},"This paper presents an approach to simulate automatic speech recognition (ASR) errors from manual transcriptions 
and describes how it can be used to improve the performance of spoken language understanding (SLU) systems. In particular, we point out that this noising process is very useful to obtain a more robust SLU system to ASR errors in case of insufficient training data or more if ASR transcriptions are not available during the training of the SLU model. The proposed method is based on the use of both acoustic and linguistic word embeddings in order to define a similarity measure between words dedicated to predict ASR confusions. Actually, we assume that words acoustically and linguistically close are the ones confused by an ASR system. By using this similarity measure in order to randomly substitute correct words by potentially confusing words in manual annotations used to train CRF- or neural- based SLU systems, we augment the training corpus with these new noisy data. Experiments were carried out on the French MEDIA corpus focusing on hotel reservation. They show that this approach significantly improves SLU system performance with a relative reduction of 21.2% of concept\u002Fvalue error rate (CVER), particularly when the SLU system is based on a neural approach (reduction of 22.4% of CVER). 
A comparison to a naive noising approach shows that the proposed noising approach is particularly relevant.",{"paper_id":9786,"title":9787,"year":81,"month":855,"day":63,"doi":9788,"resource_url":9789,"first_page":63,"last_page":63,"pdf_url":9790,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9791,"paper_type":860,"authors":9792,"abstract":9796},"lrec2018-main-500","Evaluation of Feature-Space Speaker Adaptation for End-to-End Acoustic Models","10.63317\u002F53skhikabpew","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-500","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F881.pdf","tomashenko-esteve-2018-evaluation",[9793,9795],{"paper_id":9786,"author_seq":247,"given_name":7985,"surname":9794,"affiliation":63,"orcid":63},"Tomashenko",{"paper_id":9786,"author_seq":232,"given_name":6885,"surname":6886,"affiliation":63,"orcid":63},"This paper investigates speaker adaptation techniques for bidirectional long short term memory (BLSTM) recurrent neural network based acoustic models (AMs) trained with the connectionist temporal classification (CTC) objective function. BLSTM-CTC AMs play an important role in end-to-end automatic speech recognition systems. However, there is a lack of research in speaker adaptation algorithms for these models. We explore three different feature-space adaptation approaches for CTC AMs: feature-space maximum linear regression, i-vector based adaptation, and maximum a posteriori adaptation using GMM-derived features. Experimental results on the TED-LIUM corpus demonstrate that speaker adaptation, applied in combination with data augmentation techniques, provides, in an unsupervised adaptation mode, for different test sets, up to 11–20% of relative word error rate reduction over the baseline model built on the raw filter-bank features. 
In addition, the adaptation behavior is compared for BLSTM-CTC AMs and time-delay neural network AMs trained with the cross-entropy criterion.",{"paper_id":9798,"title":9799,"year":81,"month":855,"day":63,"doi":9800,"resource_url":9801,"first_page":63,"last_page":63,"pdf_url":9802,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9803,"paper_type":860,"authors":9804,"abstract":9811},"lrec2018-main-501","Creating New Language and Voice Components for the Updated MaryTTS Text-to-Speech Synthesis Platform","10.63317\u002F2s9w7jk4ompu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-501","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1045.pdf","steiner-le-maguer-2018-creating",[9805,9808],{"paper_id":9798,"author_seq":247,"given_name":9806,"surname":9807,"affiliation":63,"orcid":63},"Ingmar","Steiner",{"paper_id":9798,"author_seq":232,"given_name":9809,"surname":9810,"affiliation":63,"orcid":63},"Sébastien","Le Maguer","We present a new workflow to create components for the MaryTTS TTS platform, which is popular with researchers and developers, extending it to support new languages and custom synthetic voices.   This workflow replaces the previous toolkit with an efficient, flexible process that leverages modern build automation and cloud-hosted infrastructure.   Moreover, it is compatible with the updated MaryTTS architecture, enabling new features and state-of-the-art paradigms such as synthesis based on DNN.   
Like MaryTTS itself, the new tools are FOSS, and promote the use of open data.",{"paper_id":9813,"title":9814,"year":81,"month":855,"day":63,"doi":9815,"resource_url":9816,"first_page":63,"last_page":63,"pdf_url":9817,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9818,"paper_type":860,"authors":9819,"abstract":9826},"lrec2018-main-502","Speech Rate Calculations with Short Utterances: A Study from a Speech-to-Speech, Machine Translation Mediated Map Task","10.63317\u002F3rh5z4mgtirg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-502","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1064.pdf","hayakawa-etal-2018-speech",[9820,9823,9824,9825],{"paper_id":9813,"author_seq":247,"given_name":9821,"surname":9822,"affiliation":63,"orcid":63},"Akira","Hayakawa",{"paper_id":9813,"author_seq":232,"given_name":4210,"surname":4211,"affiliation":63,"orcid":63},{"paper_id":9813,"author_seq":218,"given_name":3195,"surname":3196,"affiliation":63,"orcid":63},{"paper_id":9813,"author_seq":203,"given_name":3201,"surname":3202,"affiliation":63,"orcid":63},"The motivation for this paper is to present a way to verify if an utterance within a corpus is pronounced at a fast or slow pace. An alternative method to the well-known Word-Per-Minute (wpm) method for cases where this approach is not applicable. For long segmentations, such as the full introduction section of a speech or presentation, the measurement of wpm is a viable option. For short comparisons of the same single word or multiple syllables, Syllables-Per-Second (sps) is also a viable option. However, when there are multiple short utterances that are frequent in task oriented dialogues or natural free flowing conversation, such as those of the direct Human-to-Human dialogues of the HCRC Map Task corpus or the computer mediated inter-lingual dialogues of the ILMT-s2s corpus, it becomes difficult to obtain a meaningful value for the utterance speech rate. 
In this paper we explain the method used to provide an alternative speech rate value to the utterance of the ILMT-s2s corpus and the HCRC Map Task corpus.",{"paper_id":9828,"title":9829,"year":81,"month":855,"day":63,"doi":9830,"resource_url":9831,"first_page":63,"last_page":63,"pdf_url":9832,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9833,"paper_type":860,"authors":9834,"abstract":9844},"lrec2018-main-503","Beyond Generic Summarization: A Multi-faceted Hierarchical Summarization Corpus of Large Heterogeneous Data","10.63317\u002F4kkijb3gs8mg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-503","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F252.pdf","tauchmann-etal-2018-beyond",[9835,9837,9838,9840,9843],{"paper_id":9828,"author_seq":247,"given_name":1359,"surname":9836,"affiliation":63,"orcid":63},"Tauchmann",{"paper_id":9828,"author_seq":232,"given_name":1615,"surname":9013,"affiliation":63,"orcid":63},{"paper_id":9828,"author_seq":218,"given_name":3238,"surname":9839,"affiliation":63,"orcid":63},"Hanselowski",{"paper_id":9828,"author_seq":203,"given_name":9841,"surname":9842,"affiliation":63,"orcid":63},"Christian M.","Meyer",{"paper_id":9828,"author_seq":188,"given_name":9108,"surname":9109,"affiliation":63,"orcid":63},"Automatic summarization has so far focused on datasets of ten to twenty rather short documents, typically news articles. But automatic systems could in theory analyze hundreds of documents from a wide range of sources and provide an overview to the interested reader. Such a summary would ideally present the most general issues of a given topic and allow for more in-depth information on specific aspects within said topic.  In this paper, we present a new approach for creating hierarchical summarization corpora from large, heterogeneous document collections. 
We first extract relevant content using crowdsourcing and then ask trained annotators to order the relevant information hierarchically. This yields tree structures covering the specific facets discussed in a document collection. Our resulting corpus is freely available and can be used to develop and evaluate hierarchical summarization systems.",{"paper_id":9846,"title":9847,"year":81,"month":855,"day":63,"doi":9848,"resource_url":9849,"first_page":63,"last_page":63,"pdf_url":9850,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9851,"paper_type":860,"authors":9852,"abstract":9864},"lrec2018-main-504","A New Annotated Portuguese\u002FSpanish Corpus for the Multi-Sentence Compression Task","10.63317\u002F3tat2guzqfur","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-504","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F275.pdf","linhares-pontes-etal-2018-new",[9853,9856,9859,9861],{"paper_id":9846,"author_seq":247,"given_name":9854,"surname":9855,"affiliation":63,"orcid":63},"Elvys","Linhares Pontes",{"paper_id":9846,"author_seq":232,"given_name":9857,"surname":9858,"affiliation":63,"orcid":63},"Juan-Manuel","Torres-Moreno",{"paper_id":9846,"author_seq":218,"given_name":6383,"surname":9860,"affiliation":63,"orcid":63},"Huet",{"paper_id":9846,"author_seq":203,"given_name":9862,"surname":9863,"affiliation":63,"orcid":63},"Andréa Carneiro","Linhares","Multi-sentence compression aims to generate a short and informative compression from several source sentences that deal with the same topic. In this work, we present a new corpus for the Multi-Sentence Compression (MSC) task in Portuguese and Spanish. 
We also provide on this corpus a comparison of two state-of-the-art MSC systems.",{"paper_id":9866,"title":9867,"year":81,"month":855,"day":63,"doi":9868,"resource_url":9869,"first_page":63,"last_page":63,"pdf_url":9870,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9871,"paper_type":860,"authors":9872,"abstract":9880},"lrec2018-main-505","Live Blog Corpus for Summarization","10.63317\u002F3j5mnes948iz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-505","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F317.pdf","p-v-s-etal-2018-live",[9873,9876,9879],{"paper_id":9866,"author_seq":247,"given_name":9874,"surname":9875,"affiliation":63,"orcid":63},"Avinesh","P.V.S.",{"paper_id":9866,"author_seq":232,"given_name":9877,"surname":9878,"affiliation":63,"orcid":63},"Maxime","Peyrard",{"paper_id":9866,"author_seq":218,"given_name":9841,"surname":9842,"affiliation":63,"orcid":63},"Live blogs are an increasingly popular news format to cover breaking news and live events in online journalism. Online news websites around the world are using this medium to give their readers a minute by minute update on an event. Good summaries enhance the value of the live blogs for a reader but are often not available. In this paper, we study a way of collecting corpora for automatic live blog summarization. In an empirical evaluation using well-known state-of-the-art summarization systems, we show that live blogs corpus poses new challenges in the field of summarization. 
We make our tools publicly available to reconstruct the corpus to encourage the research community and replicate our results.",{"paper_id":9882,"title":9883,"year":81,"month":855,"day":63,"doi":9884,"resource_url":9885,"first_page":63,"last_page":63,"pdf_url":9886,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9887,"paper_type":860,"authors":9888,"abstract":9898},"lrec2018-main-506","TSix: A Human-involved-creation Dataset for Tweet Summarization","10.63317\u002F39q4zq3o8xmu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-506","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F516.pdf","nguyen-etal-2018-tsix",[9889,9891,9894,9896],{"paper_id":9882,"author_seq":247,"given_name":9890,"surname":1724,"affiliation":63,"orcid":63},"Minh-Tien",{"paper_id":9882,"author_seq":232,"given_name":9892,"surname":9893,"affiliation":63,"orcid":63},"Dac Viet","Lai",{"paper_id":9882,"author_seq":218,"given_name":9895,"surname":1724,"affiliation":63,"orcid":63},"Huy-Tien",{"paper_id":9882,"author_seq":203,"given_name":9897,"surname":1724,"affiliation":63,"orcid":63},"Le-Minh","We present a new dataset for tweet summarization. The dataset includes six events collected from Twitter from October 10 to November 9, 2016. Our dataset features two prominent properties. Firstly, human-annotated gold-standard references allow to correctly evaluate extractive summarization methods. Secondly, tweets are assigned into sub-topics divided by consecutive days, which facilitate incremental tweet stream summarization methods. To reveal the potential usefulness of our dataset, we compare several well-known summarization methods. Experimental results indicate that among extractive approaches, hybrid term frequency -- document term frequency obtains competitive results in term of ROUGE-scores. 
The analysis also shows that polarity is an implicit factor of tweets in our dataset, suggesting that it can be exploited as a component besides tweet content quality in the summarization process.",{"paper_id":9900,"title":9901,"year":81,"month":855,"day":63,"doi":9902,"resource_url":9903,"first_page":63,"last_page":63,"pdf_url":9904,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9905,"paper_type":860,"authors":9906,"abstract":9914},"lrec2018-main-507","A Workbench for Rapid Generation of Cross-Lingual Summaries","10.63317\u002F5aj2xv4kj8ig","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-507","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F576.pdf","jhaveri-etal-2018-workbench",[9907,9910,9911],{"paper_id":9900,"author_seq":247,"given_name":9908,"surname":9909,"affiliation":63,"orcid":63},"Nisarg","Jhaveri",{"paper_id":9900,"author_seq":232,"given_name":4429,"surname":2603,"affiliation":63,"orcid":63},{"paper_id":9900,"author_seq":218,"given_name":9912,"surname":9913,"affiliation":63,"orcid":63},"Vasudeva","Varma","The need for cross-lingual information access is more than ever with the easy accessibility to the Internet, especially in vastly multilingual societies like India. Cross-lingual summarization can help minimize human effort needed for achieving publishable articles in multiple languages, while making the most important information available in target language in the form of summaries. We describe a flexible, web-based tool for human editing of cross-lingual summaries to rapidly generate publishable summaries in a number of Indian Languages for news articles originally published in English, and simultaneously collect detailed logs about the process, at both article and sentence level. Similar to translation post-editing logs, such logs can be used to evaluate the automated cross-lingual summaries, in terms of effort needed to make them publishable. 
The generated summaries along with the logs can be used to train and improve the automatic system over time.",{"paper_id":9916,"title":9917,"year":81,"month":855,"day":63,"doi":9918,"resource_url":9919,"first_page":63,"last_page":63,"pdf_url":9920,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9921,"paper_type":860,"authors":9922,"abstract":9929},"lrec2018-main-508","Annotation and Analysis of Extractive Summaries for the Kyutech Corpus","10.63317\u002F2fbainzoug46","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-508","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F849.pdf","yamamura-shimada-2018-annotation",[9923,9926],{"paper_id":9916,"author_seq":247,"given_name":9924,"surname":9925,"affiliation":63,"orcid":63},"Takashi","Yamamura",{"paper_id":9916,"author_seq":232,"given_name":9927,"surname":9928,"affiliation":63,"orcid":63},"Kazutaka","Shimada","Summarization of multi-party conversation is one of the important tasks in natural language processing. For conversation summarization tasks, corpora have an important role to analyze characteristics of conversations and to construct a method for summary generation. We are developing a freely available Japanese conversation corpus for a decision-making task. We call it the Kyutech corpus. The current version of the Kyutech corpus contains topic tags of each utterance and reference summaries of each conversation. In this paper, we explain an annotation task of extractive summaries. In the annotation task, we annotate an importance tag for each utterance and link utterances with sentences in reference summaries that already exist in the Kyutech corpus. By using the annotated extractive summaries, we can evaluate extractive summarization methods on the Kyutech corpus. 
In the experiment, we compare some methods based on machine learning techniques with some features.",{"paper_id":9931,"title":9932,"year":81,"month":855,"day":63,"doi":9933,"resource_url":9934,"first_page":63,"last_page":63,"pdf_url":9935,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9936,"paper_type":860,"authors":9937,"abstract":9944},"lrec2018-main-509","A Repository of Corpora for Summarization","10.63317\u002F2xvghosmyh3a","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-509","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F889.pdf","dernoncourt-etal-2018-repository",[9938,9941,9943],{"paper_id":9931,"author_seq":247,"given_name":9939,"surname":9940,"affiliation":63,"orcid":63},"Franck","Dernoncourt",{"paper_id":9931,"author_seq":232,"given_name":1437,"surname":9942,"affiliation":63,"orcid":63},"Ghassemi",{"paper_id":9931,"author_seq":218,"given_name":8558,"surname":2069,"affiliation":63,"orcid":63},"Summarization corpora are numerous but fragmented, making it challenging for researchers to efficiently pinpoint corpora most suited to a given summarization task. In this paper, we introduce a repository containing corpora  available to train and evaluate automatic summarization systems. We also present an overview of the main corpora with respect to the different summarization tasks, and identify various corpus parameters that researchers may want to consider when choosing a corpus. Lastly, as the recent successes of artificial neural networks for summarization have renewed the interest in creating large-scale corpora for summarization, we survey which corpora are used in neural network research studies. We come to the conclusion that more large-scale corpora for summarization are needed. 
Furthermore, each corpus is organized differently, which makes it time-consuming for researchers to experiment a new summarization algorithm on many corpora, and as a result studies typically use one or very few corpora. Agreeing on a data standard for summarization corpora would be beneficial to the field.",{"paper_id":9946,"title":9947,"year":81,"month":855,"day":63,"doi":9948,"resource_url":9949,"first_page":63,"last_page":63,"pdf_url":9950,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9951,"paper_type":860,"authors":9952,"abstract":9955},"lrec2018-main-510","Auto-hMDS: Automatic Construction of a Large Heterogeneous Multilingual Multi-Document Summarization Corpus","10.63317\u002F5d97p2k2jt8p","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-510","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1018.pdf","zopf-2018-auto",[9953],{"paper_id":9946,"author_seq":247,"given_name":1771,"surname":9954,"affiliation":63,"orcid":63},"Zopf","Automatic text summarization is a challenging natural language processing (NLP) task which has been researched for several decades. The available datasets for multi-document summarization (MDS) are, however, rather small and usually focused on the newswire genre. Nowadays, machine learning methods are applied to more and more NLP problems such as machine translation, question answering, and single-document summarization. Modern machine learning methods such as neural networks require large training datasets which are available for the three tasks but not yet for MDS. This lack of training data limits the development of machine learning methods for MDS. In this work, we automatically generate a large heterogeneous multilingual multi-document summarization corpus. The key idea is to use Wikipedia articles as summaries and to automatically search for appropriate source documents. 
We created a corpus with 7,316 topics in English and German, which has varying summary lengths and varying number of source documents. More information about the corpus can be found at the corpus GitHub page at https:\u002F\u002Fgithub.com\u002FAIPHES\u002Fauto-hMDS.",{"paper_id":9957,"title":9958,"year":81,"month":855,"day":63,"doi":9959,"resource_url":9960,"first_page":63,"last_page":63,"pdf_url":9961,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9962,"paper_type":860,"authors":9963,"abstract":9972},"lrec2018-main-511","PyrEval: An Automated Method for Summary Content Analysis","10.63317\u002F5as4tt3gq2p2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-511","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1096.pdf","gao-etal-2018-pyreval",[9964,9967,9969],{"paper_id":9957,"author_seq":247,"given_name":9965,"surname":9966,"affiliation":63,"orcid":63},"Yanjun","Gao",{"paper_id":9957,"author_seq":232,"given_name":1731,"surname":9968,"affiliation":63,"orcid":63},"Warner",{"paper_id":9957,"author_seq":218,"given_name":9970,"surname":9971,"affiliation":63,"orcid":63},"Rebecca","Passonneau","Pyramid method is an existing content analysis approach in automatic summarization evaluation for manual construction of a pyramid content model from reference summaries, and manual scoring of the target summaries with the pyramid model. PyrEval assesses the content of automatic summarization by automating the manual pyramid method. PyrEval uses low-dimension distributional semantics to represent phrase meanings, and a new algorithm, EDUA (Emergent Discoveries of Units of Attractions), for solving set packing problem in construction of content model from vectorized phrases. Because the vectors are pretrained, and EDUA is an efficient greedy algorithm, PyrEval can replace manual pyramid with no retraining, and is very efficient. 
Moreover, PyrEval has been tested on many datasets derived from humans and machine translated summaries and shown good performance on both.",{"paper_id":9974,"title":9975,"year":81,"month":855,"day":63,"doi":9976,"resource_url":9977,"first_page":63,"last_page":63,"pdf_url":9978,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9979,"paper_type":860,"authors":9980,"abstract":9989},"lrec2018-main-512","Mapping Texts to Scripts: An Entailment Study","10.63317\u002F4j7e29ocouzr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-512","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F212.pdf","ostermann-etal-2018-mapping",[9981,9983,9986,9988],{"paper_id":9974,"author_seq":247,"given_name":1269,"surname":9982,"affiliation":63,"orcid":63},"Ostermann",{"paper_id":9974,"author_seq":232,"given_name":9984,"surname":9985,"affiliation":63,"orcid":63},"Hannah","Seitz",{"paper_id":9974,"author_seq":218,"given_name":1471,"surname":9987,"affiliation":63,"orcid":63},"Thater",{"paper_id":9974,"author_seq":203,"given_name":1058,"surname":1059,"affiliation":63,"orcid":63},"Commonsense knowledge as provided by scripts is crucially relevant for text understanding systems, providing a basis for commonsense inference. This paper considers a relevant subtask of script-based text understanding, the task of mapping event mentions in a text to script events. We focus on script representations where events are associated with paraphrase sets, i.e. sets of crowdsourced event descriptions. We provide a detailed annotation of event mention\u002Fdescription pairs with textual entailment types. We demonstrate that representing events in terms of paraphrase sets can massively improve the performance of text-to-script mapping systems. 
However, for a residual substantial fraction of cases, deeper inference is still required.",{"paper_id":9991,"title":9992,"year":81,"month":855,"day":63,"doi":9993,"resource_url":9994,"first_page":63,"last_page":63,"pdf_url":9995,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":9996,"paper_type":860,"authors":9997,"abstract":10004},"lrec2018-main-513","Semantic Equivalence Detection: Are Interrogatives Harder than Declaratives?","10.63317\u002F337qsjbk3jsg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-513","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F228.pdf","rodrigues-etal-2018-semantic",[9998,9999,10002,10003],{"paper_id":9991,"author_seq":247,"given_name":3884,"surname":4507,"affiliation":63,"orcid":63},{"paper_id":9991,"author_seq":232,"given_name":10000,"surname":10001,"affiliation":63,"orcid":63},"Chakaveh","Saedi",{"paper_id":9991,"author_seq":218,"given_name":1332,"surname":1333,"affiliation":63,"orcid":63},{"paper_id":9991,"author_seq":203,"given_name":3884,"surname":4259,"affiliation":63,"orcid":63},"Duplicate Question Detection (DQD) is a Natural Language Processing task under active research, with applications to fields like Community Question Answering and Information Retrieval. While DQD falls under the umbrella of Semantic Text Similarity (STS), these are often not seen as similar tasks of semantic equivalence detection, with STS being implicitly understood as concerning only declarative sentences. Nevertheless, approaches to STS have been applied to DQD and paraphrase detection, that is to interrogatives and declaratives, alike. We present a study that seeks to assess, under conditions of comparability, the possible different performance of state-of-the-art approaches to STS over different types of textual segments, including most notably declaratives and interrogatives.  
This paper contributes to a better understanding of current mainstream methods for semantic equivalence detection, and to a better appreciation of the different results reported in the literature when these are obtained from different data sets with different types of textual segments. Importantly, it contributes also with results concerning how data sets containing textual segments of a certain type can be used to leverage the performance of resolvers for segments of other types.",{"paper_id":10006,"title":10007,"year":81,"month":855,"day":63,"doi":10008,"resource_url":10009,"first_page":63,"last_page":63,"pdf_url":10010,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10011,"paper_type":860,"authors":10012,"abstract":10019},"lrec2018-main-514","CEFR-based Lexical Simplification Dataset","10.63317\u002F5avad39d7h9w","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-514","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F238.pdf","uchida-etal-2018-cefr",[10013,10015,10018],{"paper_id":10006,"author_seq":247,"given_name":10014,"surname":7303,"affiliation":63,"orcid":63},"Satoru",{"paper_id":10006,"author_seq":232,"given_name":10016,"surname":10017,"affiliation":63,"orcid":63},"Shohei","Takada",{"paper_id":10006,"author_seq":218,"given_name":4952,"surname":4953,"affiliation":63,"orcid":63},"This study creates a language dataset for lexical simplification based on Common European Framework of References for Languages (CEFR) levels (CEFR-LS). Lexical simplification has continued to be one of the important tasks for language learning and education. There are several language resources for lexical simplification that are available for generating rules and creating simplifiers using machine learning. However, these resources are not tailored to language education with word levels and lists of candidates tending to be subjective. 
Different from these, the present study constructs a CEFR-LS whose target and candidate words are assigned CEFR levels using CEFR-J wordlists and English Vocabulary Profile, and candidates are selected using an online thesaurus. Since CEFR is widely used around the world, using CEFR levels makes it possible to apply a simplification method based on our dataset to language education directly. CEFR-LS currently includes 406 targets and 4912 candidates. To evaluate the validity of CEFR-LS for machine learning, two basic models are employed for selecting candidates and the results are presented as a reference for future users of the dataset.",{"paper_id":10021,"title":10022,"year":81,"month":855,"day":63,"doi":10023,"resource_url":10024,"first_page":63,"last_page":63,"pdf_url":10025,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10026,"paper_type":860,"authors":10027,"abstract":10041},"lrec2018-main-515","CLARIN: Towards FAIR and Responsible Data Science Using Language Resources","10.63317\u002F2dvrmm3635mu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-515","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F575.pdf","de-jong-etal-2018-clarin",[10028,10031,10034,10037,10038],{"paper_id":10021,"author_seq":247,"given_name":10029,"surname":10030,"affiliation":63,"orcid":63},"Franciska","de Jong",{"paper_id":10021,"author_seq":232,"given_name":10032,"surname":10033,"affiliation":63,"orcid":63},"Bente","Maegaard",{"paper_id":10021,"author_seq":218,"given_name":10035,"surname":10036,"affiliation":63,"orcid":63},"Koenraad","De Smedt",{"paper_id":10021,"author_seq":203,"given_name":4759,"surname":4760,"affiliation":63,"orcid":63},{"paper_id":10021,"author_seq":188,"given_name":10039,"surname":10040,"affiliation":63,"orcid":63},"Dieter","Van Uytvanck","CLARIN is a European Research Infrastructure providing access to language resources and technologies for researchers in the humanities and social sciences. 
It supports the study of language data in general and aims to increase the potential for comparative research of cultural and societal phenomena across the boundaries of languages. This paper outlines the CLARIN vision and strategy, and it explains how the design and implementation of CLARIN are compliant with the FAIR principles: findability, accessibility, interoperability and reusability of data. The paper also explains the approach of CLARIN towards the enabling of responsible data science. Attention is paid to (i) the development of measures for increasing the transparency and explainability of the results from applying CLARIN technologies, in particular in the context of multidisciplinary research, and (ii) stimulating the uptake of its resources, tools and services by the various communities of use, all in accordance with the principles for Open Science.",{"paper_id":10043,"title":10044,"year":81,"month":855,"day":63,"doi":10045,"resource_url":10046,"first_page":63,"last_page":63,"pdf_url":10047,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10048,"paper_type":860,"authors":10049,"abstract":10059},"lrec2018-main-516","From ‘Solved Problems’ to New Challenges: A Report on LDC 
Activities","10.63317\u002F56bbwxc9bx3r","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-516","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F775.pdf","cieri-etal-2018-solved",[10050,10051,10052,10053,10056,10057],{"paper_id":10043,"author_seq":247,"given_name":1359,"surname":1360,"affiliation":63,"orcid":63},{"paper_id":10043,"author_seq":232,"given_name":1364,"surname":1365,"affiliation":63,"orcid":63},{"paper_id":10043,"author_seq":218,"given_name":1205,"surname":5377,"affiliation":63,"orcid":63},{"paper_id":10043,"author_seq":203,"given_name":10054,"surname":10055,"affiliation":63,"orcid":63},"Denise","DiPersio",{"paper_id":10043,"author_seq":188,"given_name":1370,"surname":1371,"affiliation":63,"orcid":63},{"paper_id":10043,"author_seq":172,"given_name":2516,"surname":10058,"affiliation":63,"orcid":63},"Mazzucchi","This paper reports on the activities of the Linguistic Data Consortium, the next in a sequence of such data center reports included in each LREC meeting. This report begins by sketching the changing demands for Language Resources driven by the spread of Human Language Technologies throughout the market. One result of the successful deployment of HLT enabled applications is increased demand in ever more languages. This in turn places pressure on data centers to collaborate and form global networks in order to meet the demand for LRs of increasing complexity and linguistic diversity. The report next summarizes the over 100 Language Resources released since the last report, many of which have been contributed by research groups around the world. It also covers advances in Consortium infrastructure that assure the integrity of published data sets and support future collection and annotation. 
Finally, it discusses recent and current LR creation activities that lead to new LR publications followed by data related research activities particularly in clinical applications.",{"paper_id":10061,"title":10062,"year":81,"month":855,"day":63,"doi":10063,"resource_url":10064,"first_page":63,"last_page":63,"pdf_url":10065,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10066,"paper_type":860,"authors":10067,"abstract":10077},"lrec2018-main-517","New directions in ELRA activities","10.63317\u002F5kpno2gnnhoo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-517","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F510.pdf","mapelli-etal-2018-new",[10068,10069,10072,10075,10076],{"paper_id":10061,"author_seq":247,"given_name":1317,"surname":1318,"affiliation":63,"orcid":63},{"paper_id":10061,"author_seq":232,"given_name":10070,"surname":10071,"affiliation":63,"orcid":63},"Victoria","Arranz",{"paper_id":10061,"author_seq":218,"given_name":10073,"surname":10074,"affiliation":63,"orcid":63},"Hélène","Mazo",{"paper_id":10061,"author_seq":203,"given_name":1314,"surname":1315,"affiliation":63,"orcid":63},{"paper_id":10061,"author_seq":188,"given_name":4407,"surname":1567,"affiliation":63,"orcid":63},"Beyond the generic activities (cataloguing, producing, distribution of Language Resources, dissemination of information, etc.) that make the overall ELRA mission an indispensable middle-man in the field of Language Resources (LRs), new directions of work are now being undertaken so as to answer the needs of this ever-moving community. This impacts the structure and the operating model of the association per se with the creation of a new technical committee dealing with Less-Resourced Languages and the modification of the ELRA membership policy. 
It also intrinsically impacts the axes of work at all steps of activities: offering new tools for sharing LRs and related information, adapting to new legal requirements, producing and offering field-specific data. This paper addresses these new directions and describes ELRA (and its operational body ELDA) regular activities updates. Future activities are also reported in the last part of the article. They consist in ongoing projects like the ELRC initiative, the start of another CEF-funded project, the European Language Resource Infrastructure (ELRI), the updating of the Review of existing Language Resources for languages of France, and the continuation of the ELRA Catalogue development.",{"paper_id":10079,"title":10080,"year":81,"month":855,"day":63,"doi":10081,"resource_url":10082,"first_page":63,"last_page":63,"pdf_url":10083,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10084,"paper_type":860,"authors":10085,"abstract":10093},"lrec2018-main-518","A Framework for Multi-Language Service Design with the Language Grid","10.63317\u002F59w9ggkxy3qh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-518","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F794.pdf","lin-etal-2018-framework",[10086,10088,10090],{"paper_id":10079,"author_seq":247,"given_name":10087,"surname":9639,"affiliation":63,"orcid":63},"Donghui",{"paper_id":10079,"author_seq":232,"given_name":4124,"surname":10089,"affiliation":63,"orcid":63},"Murakami",{"paper_id":10079,"author_seq":218,"given_name":10091,"surname":10092,"affiliation":63,"orcid":63},"Toru","Ishida","To collect and share language resources like machine translators and dictionaries, we developed the Language Grid in 2006, a service-oriented language infrastructure on the Internet. 
Although we have put a lot of effort into improving the service grid technologies and collecting language services, international NPO\u002FNGOs are struggling with the design and development of tools and systems for supporting multi-language communication in the real world by utilizing available language services. This paper proposes a framework for service design with the Language Grid by bridging the gap between language service infrastructures and multi-language systems. The proposed framework is implemented as a toolkit, Multilingual Studio, which is open to allow the users to design and develop multilingual communication services and tools in the real world.",{"paper_id":10095,"title":10096,"year":81,"month":855,"day":63,"doi":10097,"resource_url":10098,"first_page":63,"last_page":63,"pdf_url":10099,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10100,"paper_type":860,"authors":10101,"abstract":10106},"lrec2018-main-519","Language Technology for Multilingual Europe: An Analysis of a Large-Scale Survey regarding Challenges, Demands, Gaps and Needs","10.63317\u002F3ktp9rzk25os","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-519","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F815.pdf","rehm-hegele-2018-language",[10102,10103],{"paper_id":10095,"author_seq":247,"given_name":7813,"surname":7814,"affiliation":63,"orcid":63},{"paper_id":10095,"author_seq":232,"given_name":10104,"surname":10105,"affiliation":63,"orcid":63},"Stefanie","Hegele","We present the analysis of a large-scale survey titled “Language Technology for Multilingual Europe”, conducted between May and June 2017. A total of 634 participants in 52 countries responded to the survey. 
Its main purpose was to collect input, feedback and ideas from the European Language Technology research and innovation community in order to assess the most prominent research areas, projects and applications, but, more importantly to identify the biggest challenges, obstacles and gaps Europe is currently facing with regard to its multilingual setup and technological solutions. Participants were encouraged to share concrete suggestions and recommendations on how present challenges can be turned into opportunities in the context of a potential long-term, large-scale, Europe-wide research, development and innovation funding programme, currently titled Human Language Project.",{"paper_id":10108,"title":10109,"year":81,"month":855,"day":63,"doi":10110,"resource_url":10111,"first_page":63,"last_page":63,"pdf_url":10112,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10113,"paper_type":860,"authors":10114,"abstract":10124},"lrec2018-main-520","Annotating High-Level Structures of Short Stories and Personal Anecdotes","10.63317\u002F2u4eanbonucy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-520","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F209.pdf","li-etal-2018-annotating",[10115,10117,10120,10122],{"paper_id":10108,"author_seq":247,"given_name":10116,"surname":1591,"affiliation":63,"orcid":63},"Boyang",{"paper_id":10108,"author_seq":232,"given_name":10118,"surname":10119,"affiliation":63,"orcid":63},"Beth","Cardier",{"paper_id":10108,"author_seq":218,"given_name":10121,"surname":1588,"affiliation":63,"orcid":63},"Tong",{"paper_id":10108,"author_seq":203,"given_name":6048,"surname":10123,"affiliation":63,"orcid":63},"Metze","Stories are a vital form of communication in human culture; they are employed daily to persuade, to elicit sympathy, or to convey a message. Computational understanding of human narratives, especially high-level narrative structures, remain limited to date. 
Multiple literary theories for narrative structures exist, but operationalization of the theories has remained a challenge. We developed an annotation scheme by consolidating and extending existing narratological theories, including Labov and Waletzky’s (1967) functional categorization scheme and Freytag’s (1863) pyramid of dramatic tension, and present 360 annotated short stories collected from online sources. In the future, this research will support an approach that enables systems to intelligently sustain complex communications with humans.",{"paper_id":10126,"title":10127,"year":81,"month":855,"day":63,"doi":10128,"resource_url":10129,"first_page":63,"last_page":63,"pdf_url":10130,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10131,"paper_type":860,"authors":10132,"abstract":10142},"lrec2018-main-521","Discovering the Language of Wine Reviews: A Text Mining Account","10.63317\u002F4fwq7gsn5byt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-521","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F496.pdf","lefever-etal-2018-discovering",[10133,10134,10135,10138,10139],{"paper_id":10126,"author_seq":247,"given_name":6095,"surname":6096,"affiliation":63,"orcid":63},{"paper_id":10126,"author_seq":232,"given_name":2283,"surname":2284,"affiliation":63,"orcid":63},{"paper_id":10126,"author_seq":218,"given_name":10136,"surname":10137,"affiliation":63,"orcid":63},"Ilja","Croijmans",{"paper_id":10126,"author_seq":203,"given_name":2317,"surname":2318,"affiliation":63,"orcid":63},{"paper_id":10126,"author_seq":188,"given_name":10140,"surname":10141,"affiliation":63,"orcid":63},"Asifa","Majid","It is widely held that smells and flavors are impossible to put into words. In this paper we test this claim by seeking predictive patterns in wine reviews, which ostensibly aim to provide guides to perceptual content. Wine reviews have previously been critiqued as random and meaningless.  
We collected an English corpus of wine reviews with their structured metadata, and applied machine learning techniques to automatically predict the wine's color, grape variety, and country of origin. To train the three supervised classifiers, three different information sources were incorporated: lexical bag-of-words features, domain-specific terminology features, and semantic word embedding features. In addition, using regression analysis we investigated basic review properties, i.e., review length, average word length, and their relationship to the scalar values of price and review score. Our results show that wine experts do share a common vocabulary to describe wines and they use this in a consistent way, which makes it possible to automatically predict wine characteristics based on the review text alone. This means that odors and flavors may be more expressible in language than typically acknowledged.",{"paper_id":10144,"title":10145,"year":81,"month":855,"day":63,"doi":10146,"resource_url":10147,"first_page":63,"last_page":63,"pdf_url":10148,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10149,"paper_type":860,"authors":10150,"abstract":10157},"lrec2018-main-522","Toward An Epic Epigraph Graph","10.63317\u002F48bs63isddye","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-522","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F610.pdf","bond-matthews-2018-toward",[10151,10154],{"paper_id":10144,"author_seq":247,"given_name":10152,"surname":10153,"affiliation":63,"orcid":63},"Francis","Bond",{"paper_id":10144,"author_seq":232,"given_name":10155,"surname":10156,"affiliation":63,"orcid":63},"Graham","Matthews","We present a database of epigraphs collected with the goal of revealing literary influence as a set of connections between authors over time.  We have collected epigraphs from over 12,000 literary works and are in the process of identifying their provenance.  
The database is released under an open license.",{"paper_id":10159,"title":10160,"year":81,"month":855,"day":63,"doi":10161,"resource_url":10162,"first_page":63,"last_page":63,"pdf_url":10163,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10164,"paper_type":860,"authors":10165,"abstract":10178},"lrec2018-main-523","Delta vs. N-Gram Tracing: Evaluating the Robustness of Authorship Attribution Methods","10.63317\u002F4u4qcrjt5isy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-523","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F835.pdf","proisl-etal-2018-delta",[10166,10167,10169,10172,10174,10176],{"paper_id":10159,"author_seq":247,"given_name":1615,"surname":2964,"affiliation":63,"orcid":63},{"paper_id":10159,"author_seq":232,"given_name":1471,"surname":10168,"affiliation":63,"orcid":63},"Evert",{"paper_id":10159,"author_seq":218,"given_name":10170,"surname":10171,"affiliation":63,"orcid":63},"Fotis","Jannidis",{"paper_id":10159,"author_seq":203,"given_name":3669,"surname":10173,"affiliation":63,"orcid":63},"Schöch",{"paper_id":10159,"author_seq":188,"given_name":5635,"surname":10175,"affiliation":63,"orcid":63},"Konle",{"paper_id":10159,"author_seq":172,"given_name":3996,"surname":10177,"affiliation":63,"orcid":63},"Pielström","Delta measures are a well-established and popular family of authorship attribution methods, especially for literary texts. N-gram tracing is a novel method for authorship attribution designed for very short texts, which has its roots in forensic linguistics. We evaluate the performance of both methods in a series of experiments on English, French and German literary texts, in order to investigate the relationship between authorship attribution accuracy and text length as well as the composition of the comparison corpus. 
Our results show that, at least in our setting, both methods require relatively long texts and are furthermore highly sensitive to the choice of authors and texts in the comparison corpus.",{"paper_id":10180,"title":10181,"year":81,"month":855,"day":63,"doi":10182,"resource_url":10183,"first_page":63,"last_page":63,"pdf_url":10184,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10185,"paper_type":860,"authors":10186,"abstract":10195},"lrec2018-main-524","An Attribution Relations Corpus for Political News","10.63317\u002F2ragacqgsct6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-524","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1051.pdf","newell-etal-2018-attribution",[10187,10189,10192],{"paper_id":10180,"author_seq":247,"given_name":1073,"surname":10188,"affiliation":63,"orcid":63},"Newell",{"paper_id":10180,"author_seq":232,"given_name":10190,"surname":10191,"affiliation":63,"orcid":63},"Drew","Margolin",{"paper_id":10180,"author_seq":218,"given_name":10193,"surname":10194,"affiliation":63,"orcid":63},"Derek","Ruths","An attribution occurs when an author quotes, paraphrases, or describes the statements and private states of a third party. Journalists use attribution to report statements and attitudes of public figures, organizations, and ordinary individuals. Properly recognizing attributions in context is an essential aspect of natural language understanding and implicated in many NLP tasks, but current resources are limited in size and completeness. We introduce the Political News Attribution Relations Corpus 2016 (PolNeAR)---the largest, most complete attribution relations corpus to date. This dataset greatly increases the volume of high-quality attribution annotations, addresses shortcomings of existing resources, and expands the diversity of publishers sourced. 
PolNeAR is built on news articles covering the political candidates during the year leading up to US Presidential Election in November of 2016. The dataset will support the creation of sophisticated end-to-end solutions for attribution extraction and invite interdisciplinary collaboration between the NLP, communications, political science, and journalism communities. Along with the dataset we contribute revised guidelines aimed at improving clarity and consistency in the annotation task, and an annotation interface specially adapted to the task, for reproduction or extension of this work",{"paper_id":10197,"title":10198,"year":81,"month":855,"day":63,"doi":10199,"resource_url":10200,"first_page":63,"last_page":63,"pdf_url":10201,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10202,"paper_type":860,"authors":10203,"abstract":10225},"lrec2018-main-525","Face2Text: Collecting an Annotated Image Description Corpus for the Generation of Rich Face Descriptions","10.63317\u002F2b8fiqsdwe99","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-525","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F226.pdf","gatt-etal-2018-face2text",[10204,10206,10208,10210,10211,10214,10217,10220,10222],{"paper_id":10197,"author_seq":247,"given_name":1685,"surname":10205,"affiliation":63,"orcid":63},"Gatt",{"paper_id":10197,"author_seq":232,"given_name":4664,"surname":10207,"affiliation":63,"orcid":63},"Tanti",{"paper_id":10197,"author_seq":218,"given_name":1676,"surname":10209,"affiliation":63,"orcid":63},"Muscat",{"paper_id":10197,"author_seq":203,"given_name":2999,"surname":3000,"affiliation":63,"orcid":63},{"paper_id":10197,"author_seq":188,"given_name":10212,"surname":10213,"affiliation":63,"orcid":63},"Reuben 
A","Farrugia",{"paper_id":10197,"author_seq":172,"given_name":10215,"surname":10216,"affiliation":63,"orcid":63},"Claudia","Borg",{"paper_id":10197,"author_seq":155,"given_name":10218,"surname":10219,"affiliation":63,"orcid":63},"Kenneth P","Camilleri",{"paper_id":10197,"author_seq":138,"given_name":1780,"surname":10221,"affiliation":63,"orcid":63},"Rosner",{"paper_id":10197,"author_seq":121,"given_name":10223,"surname":10224,"affiliation":63,"orcid":63},"Lonneke","van der Plas","The past few years have witnessed renewed interest in NLP tasks at the interface between vision and language. One intensively-studied problem is that of automatically generating text from images. In this paper, we extend this problem to the more specific domain of face description. Unlike scene descriptions, face descriptions are more fine-grained and rely on attributes extracted from the image, rather than objects and relations. Given that no data exists for this task, we present an ongoing crowdsourcing study to collect a corpus of descriptions of face images taken ‘in the wild’. To gain a better understanding of the variation we find in face description and the possible issues that this may raise, we also conducted an annotation study on a subset of the corpus. 
Primarily, we found descriptions to refer to a mixture of attributes, not only physical, but also emotional and inferential, which is bound to create further challenges for current image-to-text methods.",{"paper_id":10227,"title":10228,"year":81,"month":855,"day":63,"doi":10229,"resource_url":10230,"first_page":63,"last_page":63,"pdf_url":10231,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10232,"paper_type":860,"authors":10233,"abstract":10239},"lrec2018-main-526","Adapting Serious Game for Fallacious Argumentation to German: Pitfalls, Insights, and Best Practices","10.63317\u002F2q3judwjab6v","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-526","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F494.pdf","habernal-etal-2018-adapting",[10234,10236,10238],{"paper_id":10227,"author_seq":247,"given_name":3755,"surname":10235,"affiliation":63,"orcid":63},"Habernal",{"paper_id":10227,"author_seq":232,"given_name":2451,"surname":10237,"affiliation":63,"orcid":63},"Pauli",{"paper_id":10227,"author_seq":218,"given_name":4585,"surname":4586,"affiliation":63,"orcid":63},"As argumentation about controversies is culture- and language-dependent, porting a serious game that deals with daily argumentation to another language requires substantial adaptation. This article presents a study of deploying Argotario (serious game for learning argumentation fallacies) in the German context. We examine all steps that are necessary to end up with a successful serious game platform, such as topic selection, initial data creation, or effective campaigns. Moreover, we analyze users' behavior and in-game created data in order to assess the dissemination strategies and qualitative aspects of the resulting corpus. 
We also report on classification experiments based on neural networks and feature-based models.",{"paper_id":10241,"title":10242,"year":81,"month":855,"day":63,"doi":10243,"resource_url":10244,"first_page":63,"last_page":63,"pdf_url":10245,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10246,"paper_type":860,"authors":10247,"abstract":10258},"lrec2018-main-527","Crowdsourcing Regional Variation Data and Automatic Geolocalisation of Speakers of European French","10.63317\u002F2r7siyan4x8n","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-527","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F517.pdf","goldman-etal-2018-crowdsourcing",[10248,10249,10251,10253,10254,10256],{"paper_id":10241,"author_seq":247,"given_name":5185,"surname":5186,"affiliation":63,"orcid":63},{"paper_id":10241,"author_seq":232,"given_name":2187,"surname":10250,"affiliation":63,"orcid":63},"Scherrer",{"paper_id":10241,"author_seq":218,"given_name":4515,"surname":10252,"affiliation":63,"orcid":63},"Glikman",{"paper_id":10241,"author_seq":203,"given_name":5190,"surname":5191,"affiliation":63,"orcid":63},{"paper_id":10241,"author_seq":188,"given_name":9004,"surname":10255,"affiliation":63,"orcid":63},"Benzitoun",{"paper_id":10241,"author_seq":172,"given_name":2089,"surname":10257,"affiliation":63,"orcid":63},"Boula de Mareüil","We present the crowdsourcing platform Donnez Votre Français à la Science (DFS, or “Give your French to Science”), which aims to collect linguistic data and document language use, with a special focus on regional variation in European French. The activities not only gather data that is useful for scientific studies, but they also provide feedback to the general public; this is important in order to reward participants, to encourage them to follow future surveys, and to foster interaction with the scientific community. 
The two main activities described here are 1) a linguistic survey on lexical variation with immediate feedback and 2) a speaker geolocalisation system; i.e., a quiz that guesses the linguistic origin of the participant by comparing their answers with previously gathered linguistic data. For the geolocalisation activity, we set up a simulation framework to optimise predictions. Three classification algorithms are compared: the first one uses clustering and shibboleth detection, whereas the other two rely on feature elimination techniques with Support Vector Machines and Maximum Entropy models as underlying base classifiers. The best-performing system uses a selection of 17 questions and reaches a localisation accuracy of 66%, extending the prediction from the one-best area (one among 109 base areas) to its first-order and second-order neighbouring areas.",{"paper_id":10260,"title":10261,"year":81,"month":855,"day":63,"doi":10262,"resource_url":10263,"first_page":63,"last_page":63,"pdf_url":10264,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10265,"paper_type":860,"authors":10266,"abstract":10286},"lrec2018-main-528","Improving Machine Translation of Educational Content via Crowdsourcing","10.63317\u002F4oexxebwbmz6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-528","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F855.pdf","behnke-etal-2018-improving",[10267,10270,10273,10274,10275,10276,10277,10278,10279,10280,10282,10283,10284,10285],{"paper_id":10260,"author_seq":247,"given_name":10268,"surname":10269,"affiliation":63,"orcid":63},"Maximiliana","Behnke",{"paper_id":10260,"author_seq":232,"given_name":10271,"surname":10272,"affiliation":63,"orcid":63},"Antonio Valerio","Miceli 
Barone",{"paper_id":10260,"author_seq":218,"given_name":943,"surname":944,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":203,"given_name":2295,"surname":2296,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":188,"given_name":2289,"surname":2290,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":172,"given_name":2286,"surname":2287,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":155,"given_name":2301,"surname":2302,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":138,"given_name":2304,"surname":2305,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":121,"given_name":2353,"surname":2354,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":104,"given_name":949,"surname":10281,"affiliation":63,"orcid":63},"Gaspari",{"paper_id":10260,"author_seq":87,"given_name":2307,"surname":2308,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":73,"given_name":2310,"surname":2311,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":55,"given_name":1771,"surname":1814,"affiliation":63,"orcid":63},{"paper_id":10260,"author_seq":38,"given_name":2292,"surname":2293,"affiliation":63,"orcid":63},"The limited availability of in-domain training data is a major issue in the training of application-specific neural machine translation models. Professional outsourcing of bilingual data collections is costly and often not feasible. In this paper we analyze the influence of using crowdsourcing as a scalable way to obtain translations of target in-domain data having in mind that the translations can be of a lower quality. We apply crowdsourcing with carefully designed quality controls to create parallel corpora for the educational domain by collecting translations of texts from MOOCs from English to eleven languages, which we then use to fine-tune neural machine translation models previously trained on general-domain data. 
The results from our research indicate that crowdsourced data collected with proper quality controls consistently yields performance gains over general-domain baseline systems, and systems fine-tuned with pre-existing in-domain corpora.",{"paper_id":10288,"title":10289,"year":81,"month":855,"day":63,"doi":10290,"resource_url":10291,"first_page":63,"last_page":63,"pdf_url":10292,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10293,"paper_type":860,"authors":10294,"abstract":10306},"lrec2018-main-529","Grounding Gradable Adjectives through Crowdsourcing","10.63317\u002F5ii9w43m49ix","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-529","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F977.pdf","sharp-etal-2018-grounding",[10295,10297,10299,10302,10305],{"paper_id":10288,"author_seq":247,"given_name":9970,"surname":10296,"affiliation":63,"orcid":63},"Sharp",{"paper_id":10288,"author_seq":232,"given_name":10298,"surname":3690,"affiliation":63,"orcid":63},"Mithun",{"paper_id":10288,"author_seq":218,"given_name":10300,"surname":10301,"affiliation":63,"orcid":63},"Ajay","Nagesh",{"paper_id":10288,"author_seq":203,"given_name":10303,"surname":10304,"affiliation":63,"orcid":63},"Dane","Bell",{"paper_id":10288,"author_seq":188,"given_name":2819,"surname":2820,"affiliation":63,"orcid":63},"In order to build technology that has the ability to answer questions relevant to national and global security, e.g., on food insecurity in certain parts of the world, one has to implement machine reading technology that extracts causal mechanisms from texts. Unfortunately, many of these texts describe these interactions using vague, high-level language. One particular example is the use of gradable adjectives, i.e., adjectives that can take a range of magnitudes such as small or slight.  Here we propose a method for estimating specific concrete groundings for a set of such gradable adjectives.  
We use crowdsourcing to gather human language intuitions about the impact of each adjective, then fit a linear mixed effects model to this data. The resulting model is able to estimate the impact of novel instances of these adjectives found in text. We evaluate our model in terms of its ability to generalize to unseen data and find that it has a predictive R2 of 0.632 in general, and 0.677 on a subset of high-frequency adjectives.",{"paper_id":10308,"title":10309,"year":81,"month":855,"day":63,"doi":10310,"resource_url":10311,"first_page":63,"last_page":63,"pdf_url":10312,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10313,"paper_type":860,"authors":10314,"abstract":10328},"lrec2018-main-530","Evaluating Phonemic Transcription of Low-Resource Tonal Languages for Language Documentation","10.63317\u002F2q2xrdffvva2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-530","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F490.pdf","adams-etal-2018-evaluation",[10315,10317,10319,10321,10324,10326],{"paper_id":10308,"author_seq":247,"given_name":1049,"surname":10316,"affiliation":63,"orcid":63},"Adams",{"paper_id":10308,"author_seq":232,"given_name":7651,"surname":10318,"affiliation":63,"orcid":63},"Cohn",{"paper_id":10308,"author_seq":218,"given_name":10155,"surname":10320,"affiliation":63,"orcid":63},"Neubig",{"paper_id":10308,"author_seq":203,"given_name":10322,"surname":10323,"affiliation":63,"orcid":63},"Hilaria","Cruz",{"paper_id":10308,"author_seq":188,"given_name":1085,"surname":10325,"affiliation":63,"orcid":63},"Bird",{"paper_id":10308,"author_seq":172,"given_name":1127,"surname":10327,"affiliation":63,"orcid":63},"Michaud","Transcribing speech is an important part of language documentation, yet speech recognition technology has not been widely harnessed to aid linguists. 
We explore the use of a neural network architecture with the connectionist temporal classification loss function for phonemic and tonal transcription in a language documentation setting. In this framework, we explore jointly modelling phonemes and tones versus modelling them separately, and assess the importance of pitch information versus phonemic context for tonal prediction. Experiments on two tonal languages, Yongning Na and Eastern Chatino, show the changes in recognition performance as training data is scaled from 10 minutes up to 50 minutes for Chatino, and up to 224 minutes for Na. We discuss the findings from incorporating this technology into the linguistic workflow for documenting Yongning Na, which show the method's promise in improving efficiency, minimizing typographical errors, and maintaining the transcription's faithfulness to the acoustic signal, while highlighting phonetic and phonemic facts for linguistic consideration.",{"paper_id":10330,"title":10331,"year":81,"month":855,"day":63,"doi":10332,"resource_url":10333,"first_page":63,"last_page":63,"pdf_url":10334,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10335,"paper_type":860,"authors":10336,"abstract":10366},"lrec2018-main-531","A Very Low Resource Language Speech Corpus for Computational Language Documentation 
Experiments","10.63317\u002F52v32ouunucq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-531","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F694.pdf","godard-etal-2018-low",[10337,10339,10342,10343,10346,10347,10350,10353,10354,10356,10357,10359,10361,10363],{"paper_id":10330,"author_seq":247,"given_name":1388,"surname":10338,"affiliation":63,"orcid":63},"Godard",{"paper_id":10330,"author_seq":232,"given_name":10340,"surname":10341,"affiliation":63,"orcid":63},"Gilles","Adda",{"paper_id":10330,"author_seq":218,"given_name":5170,"surname":5171,"affiliation":63,"orcid":63},{"paper_id":10330,"author_seq":203,"given_name":10344,"surname":10345,"affiliation":63,"orcid":63},"Juan","Benjumea",{"paper_id":10330,"author_seq":188,"given_name":866,"surname":867,"affiliation":63,"orcid":63},{"paper_id":10330,"author_seq":172,"given_name":10348,"surname":10349,"affiliation":63,"orcid":63},"Jamison","Cooper-Leavitt",{"paper_id":10330,"author_seq":155,"given_name":10351,"surname":10352,"affiliation":63,"orcid":63},"Guy-Noel","Kouarata",{"paper_id":10330,"author_seq":138,"given_name":5173,"surname":5174,"affiliation":63,"orcid":63},{"paper_id":10330,"author_seq":121,"given_name":10073,"surname":10355,"affiliation":63,"orcid":63},"Maynard",{"paper_id":10330,"author_seq":104,"given_name":1771,"surname":6078,"affiliation":63,"orcid":63},{"paper_id":10330,"author_seq":87,"given_name":1019,"surname":10358,"affiliation":63,"orcid":63},"Rialland",{"paper_id":10330,"author_seq":73,"given_name":5671,"surname":10360,"affiliation":63,"orcid":63},"Stueker",{"paper_id":10330,"author_seq":55,"given_name":3525,"surname":10362,"affiliation":63,"orcid":63},"Yvon",{"paper_id":10330,"author_seq":38,"given_name":10364,"surname":10365,"affiliation":63,"orcid":63},"Marcely","Zanon-Boito","Most speech and language technologies are trained with massive amounts of speech and text information. 
However, most of the world languages do not have such resources and some even lack a stable orthography. Building systems under these almost zero resource conditions is not only promising for speech technology but also for computational language documentation. The goal of computational language documentation is to help field linguists to (semi-)automatically analyze and annotate audio recordings of endangered, unwritten languages. Example tasks are automatic phoneme discovery or lexicon discovery from the speech signal. This paper presents a speech corpus collected during a realistic language documentation process. It is made up of 5k speech utterances in Mboshi (Bantu C25) aligned to French text translations. Speech transcriptions are also made available: they correspond to a non-standard graphemic form close to the language phonology.  We detail how the data was collected, cleaned and processed and we illustrate its use through a zero-resource task: spoken term discovery. The dataset is made available to the community  for reproducible computational language documentation experiments and their evaluation.",{"paper_id":10368,"title":10369,"year":81,"month":855,"day":63,"doi":10370,"resource_url":10371,"first_page":63,"last_page":63,"pdf_url":10372,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10373,"paper_type":860,"authors":10374,"abstract":10382},"lrec2018-main-532","Chahta Anumpa: A multimodal corpus of the Choctaw 
Language","10.63317\u002F2gsesyabokta","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-532","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F822.pdf","brixey-etal-2018-chahta",[10375,10378,10381],{"paper_id":10368,"author_seq":247,"given_name":10376,"surname":10377,"affiliation":63,"orcid":63},"Jacqueline","Brixey",{"paper_id":10368,"author_seq":232,"given_name":10379,"surname":10380,"affiliation":63,"orcid":63},"Eli","Pincus",{"paper_id":10368,"author_seq":218,"given_name":1208,"surname":1209,"affiliation":63,"orcid":63},"This paper presents a general use corpus for the Native American indigenous language Choctaw. The corpus contains audio, video, and text resources, with many texts also translated in English. The Oklahoma Choctaw and the Mississippi Choctaw variants of the language are represented in the corpus.  The data set provides documentation support for the threatened language, and allows researchers and language teachers access to a diverse collection of resources.",{"paper_id":10384,"title":10385,"year":81,"month":855,"day":63,"doi":10386,"resource_url":10387,"first_page":63,"last_page":63,"pdf_url":10388,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10389,"paper_type":860,"authors":10390,"abstract":10404},"lrec2018-main-533","BULBasaa: A Bilingual Basaa-French Speech Corpus for the Evaluation of Language Documentation 
Tools","10.63317\u002F29r9yggbjv5c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-533","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F948.pdf","hamlaoui-etal-2018-bulbasaa",[10391,10394,10397,10398,10400,10401,10402],{"paper_id":10384,"author_seq":247,"given_name":10392,"surname":10393,"affiliation":63,"orcid":63},"Fatima","Hamlaoui",{"paper_id":10384,"author_seq":232,"given_name":10395,"surname":10396,"affiliation":63,"orcid":63},"Emmanuel-Moselly","Makasso",{"paper_id":10384,"author_seq":218,"given_name":1771,"surname":6936,"affiliation":63,"orcid":63},{"paper_id":10384,"author_seq":203,"given_name":1278,"surname":10399,"affiliation":63,"orcid":63},"Engelmann",{"paper_id":10384,"author_seq":188,"given_name":10340,"surname":10341,"affiliation":63,"orcid":63},{"paper_id":10384,"author_seq":172,"given_name":2050,"surname":6711,"affiliation":63,"orcid":63},{"paper_id":10384,"author_seq":155,"given_name":5671,"surname":10403,"affiliation":63,"orcid":63},"Stüker","Basaa is one of the three Bantu languages of BULB (Breaking the Unwritten Language Barrier), a project whose aim is to provide NLP-based tools to support linguists in documenting under-resourced and unwritten languages. To develop technologies such as automatic phone transcription or machine translation, a massive amount of speech data is needed. Approximately 50 hours of Basaa speech were thus collected and then carefully re-spoken and orally translated into French in a controlled environment by a few bilingual speakers. For a subset of approx. 10 hours of the corpus, each utterance was additionally phonetically transcribed to establish a golden standard for the output of our NLP tools. The experiments described in this paper are meant to provide an automatic phonetic transcription using a set of derived phone-like units. 
As every language features a specific set of idiosyncrasies, automating the process of phonetic unit discovery in its entirety is a challenging task. Within BULB, we envision a workflow where linguists are able to refine the set of automatically discovered units and the system is then able to re-iterate on the data, providing a better approximation of the actual phone set.",{"paper_id":10406,"title":10407,"year":81,"month":855,"day":63,"doi":10408,"resource_url":10409,"first_page":63,"last_page":63,"pdf_url":10410,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10411,"paper_type":860,"authors":10412,"abstract":10416},"lrec2018-main-534","Researching Less-Resourced Languages – the DigiSami Corpus","10.63317\u002F2t3iu5mp6chy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-534","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F954.pdf","jokinen-2018-researching",[10413],{"paper_id":10406,"author_seq":247,"given_name":10414,"surname":10415,"affiliation":63,"orcid":63},"Kristiina","Jokinen","Increased use of digital devices and data repositories has enabled a digital revolution in data collection and language research, and has also led to important activities supporting speech and language technology research for less-resourced languages. This paper describes the DigiSami project and its research results, focussing on spoken corpus collection and speech technology for the Fenno-Ugric language North Sami. 
The paper also discusses multifaceted questions on ethics and privacy related to data collection for less-resourced languages and indigenous communities.",{"paper_id":10418,"title":10419,"year":81,"month":855,"day":63,"doi":10420,"resource_url":10421,"first_page":63,"last_page":63,"pdf_url":10422,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10423,"paper_type":860,"authors":10424,"abstract":10443},"lrec2018-main-535","The MADAR Arabic Dialect Corpus and Lexicon","10.63317\u002F4wv9mx9ftkrk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-535","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F351.pdf","bouamor-etal-2018-madar",[10425,10426,10427,10429,10430,10432,10435,10436,10437,10440,10442],{"paper_id":10418,"author_seq":247,"given_name":8344,"surname":8345,"affiliation":63,"orcid":63},{"paper_id":10418,"author_seq":232,"given_name":3646,"surname":3647,"affiliation":63,"orcid":63},{"paper_id":10418,"author_seq":218,"given_name":1437,"surname":10428,"affiliation":63,"orcid":63},"Salameh",{"paper_id":10418,"author_seq":203,"given_name":3042,"surname":3043,"affiliation":63,"orcid":63},{"paper_id":10418,"author_seq":188,"given_name":1727,"surname":10431,"affiliation":63,"orcid":63},"Rambow",{"paper_id":10418,"author_seq":172,"given_name":10433,"surname":10434,"affiliation":63,"orcid":63},"Dana","Abdulrahim",{"paper_id":10418,"author_seq":155,"given_name":8337,"surname":8338,"affiliation":63,"orcid":63},{"paper_id":10418,"author_seq":138,"given_name":8340,"surname":8341,"affiliation":63,"orcid":63},{"paper_id":10418,"author_seq":121,"given_name":10438,"surname":10439,"affiliation":63,"orcid":63},"Fadhl","Eryani",{"paper_id":10418,"author_seq":104,"given_name":2736,"surname":10441,"affiliation":63,"orcid":63},"Erdmann",{"paper_id":10418,"author_seq":87,"given_name":8348,"surname":8349,"affiliation":63,"orcid":63},"In this paper, we present two resources that were created as part of the Multi Arabic 
Dialect Applications and Resources (MADAR) project. The first is a large parallel corpus of 25 Arabic city dialects in the travel domain. The second is a lexicon of 1,045 concepts with an average of 45 words from 25 cities per concept. These resources are the first of their kind in terms of the breadth of their coverage and the fine location granularity. The focus on cities, as opposed to regions in studying Arabic dialects, opens new avenues to many areas of research from dialectology to dialect identification and machine translation.",{"paper_id":10445,"title":10446,"year":81,"month":855,"day":63,"doi":10447,"resource_url":10448,"first_page":63,"last_page":63,"pdf_url":10449,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10450,"paper_type":860,"authors":10451,"abstract":10457},"lrec2018-main-536","Designing a Collaborative Process to Create Bilingual Dictionaries of Indonesian Ethnic Languages","10.63317\u002F54arcphjz35s","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-536","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F375.pdf","nasution-etal-2018-designing",[10452,10455,10456],{"paper_id":10445,"author_seq":247,"given_name":10453,"surname":10454,"affiliation":63,"orcid":63},"Arbi Haza","Nasution",{"paper_id":10445,"author_seq":232,"given_name":4124,"surname":10089,"affiliation":63,"orcid":63},{"paper_id":10445,"author_seq":218,"given_name":10091,"surname":10092,"affiliation":63,"orcid":63},"The constraint-based approach has been proven useful for inducing bilingual dictionary for closely-related low-resource languages. When we want to create multiple bilingual dictionaries linking several languages, we need to consider manual creation by a native speaker if there are no machine-readable dictionaries available as input. To overcome the difficulty in planning the creation of bilingual dictionaries, the consideration of various methods and costs, plan optimization is essential. 
Utilizing both constraint-based approach and plan optimizer, we design a collaborative process for creating 10 bilingual dictionaries from every combination of 5 languages, i.e., Indonesian, Malay, Minangkabau, Javanese, and Sundanese. We further design an online collaborative dictionary generation to bridge spatial gap between native speakers. We define a heuristic plan that only utilizes manual investment by the native speaker to evaluate our optimal plan with total cost as an evaluation metric. The optimal plan outperformed the heuristic plan with a 63.3% cost reduction.",{"paper_id":10459,"title":10460,"year":81,"month":855,"day":63,"doi":10461,"resource_url":10462,"first_page":63,"last_page":63,"pdf_url":10463,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10464,"paper_type":860,"authors":10465,"abstract":10470},"lrec2018-main-537","Constructing a Lexicon of Relational Nouns","10.63317\u002F27a9bt4m4kvq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-537","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F461.pdf","newell-cheung-2018-constructing",[10466,10467],{"paper_id":10459,"author_seq":247,"given_name":1073,"surname":10188,"affiliation":63,"orcid":63},{"paper_id":10459,"author_seq":232,"given_name":10468,"surname":10469,"affiliation":63,"orcid":63},"Jackie C.K.","Cheung","Relational nouns refer to an entity by virtue of how it relates to another entity. Their identification in text is a prerequisite for the correct semantic interpretation of a sentence, and could be used to improve information extraction.  Although various systems for extracting relations expressed using nouns have been developed, there are no dedicated lexical resources for relational nouns. We contribute a lexicon of 6,224 labeled nouns which includes 1,446 relational nouns.  
We describe the bootstrapped annotation of relational nouns, and develop a classifier that achieves 70.4% F1 when tested on held out nouns that are among the most common 2,500 word types in Gigaword.  We make the lexicon and classifier available to the scientific community.",{"paper_id":10472,"title":10473,"year":81,"month":855,"day":63,"doi":10474,"resource_url":10475,"first_page":63,"last_page":63,"pdf_url":10476,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10477,"paper_type":860,"authors":10478,"abstract":10481},"lrec2018-main-538","Creating Large-Scale Multilingual Cognate Tables","10.63317\u002F3qihfuz28ged","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-538","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F934.pdf","wu-yarowsky-2018-creating",[10479,10480],{"paper_id":10472,"author_seq":247,"given_name":3702,"surname":3703,"affiliation":63,"orcid":63},{"paper_id":10472,"author_seq":232,"given_name":1199,"surname":3705,"affiliation":63,"orcid":63},"Low-resource languages often suffer from a lack of high-coverage lexical resources. In this paper, we propose a method to generate cognate tables by clustering words from existing lexical resources. We then employ character-based machine translation methods in solving the task of cognate chain completion by inducing missing word translations from lower-coverage dictionaries to fill gaps in the cognate chain, finding improvements over single language pair baselines when employing simple but novel multi-language system combination on the Romance and Turkic language families. For the Romance family, we show that system combination using the results of clustering outperforms weights derived from the historical-linguistic scholarship on language phylogenies. Our approach is applicable to any language family and has not been previously performed at such scale. 
The cognate tables are released to the research community.",{"paper_id":10483,"title":10484,"year":81,"month":855,"day":63,"doi":10485,"resource_url":10486,"first_page":63,"last_page":63,"pdf_url":10487,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10488,"paper_type":860,"authors":10489,"abstract":10494},"lrec2018-main-539","Lexical Profiling of Environmental Corpora","10.63317\u002F44bfrjvhx2no","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-539","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F752.pdf","drouin-etal-2018-lexical",[10490,10492,10493],{"paper_id":10483,"author_seq":247,"given_name":2451,"surname":10491,"affiliation":63,"orcid":63},"Drouin",{"paper_id":10483,"author_seq":232,"given_name":9573,"surname":9574,"affiliation":63,"orcid":63},{"paper_id":10483,"author_seq":218,"given_name":6238,"surname":9576,"affiliation":63,"orcid":63},"This paper describes a method for distinguishing lexical layers in environmental corpora (i.e. the general lexicon, the transdisciplinary lexicon and two sets of lexical items related to the domain). More specifically we aim to identify the general environmental lexicon (GEL) and assess the extent to which we can set it apart from the others. The general intuition on which this research is based is that the GEL is both well-distributed in a specialized corpus (criterion 1) and specific to this type of corpora (criterion 2). The corpus used in the current experiment, made of 6 subcorpora that amount to 4.6 million tokens, was compiled manually by terminologists for different projects designed to enrich a terminological resource. In order to meet criterion 1, the distribution of the GEL candidates is evaluated using a simple and well-known measure called idf. As for criterion 2, GEL candidates are extracted using a term extractor, which provides a measure of their specificity relative to a corpus. 
Our study focuses on single-word lexical items including nouns, verbs and adjectives. The results were validated by a team of 4 annotators who are all familiar with the environmental lexicon and they  show that using a high specificity threshold and a low idf threshold constitutes a good starting point to identify the GEL layer in our corpora.",{"paper_id":10496,"title":10497,"year":81,"month":855,"day":63,"doi":10498,"resource_url":10499,"first_page":63,"last_page":63,"pdf_url":10500,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10501,"paper_type":860,"authors":10502,"abstract":10507},"lrec2018-main-540","Linking, Searching, and Visualizing Entities in Wikipedia","10.63317\u002F5dcwxe8u8zw7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-540","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F93.pdf","klang-nugues-2018-linking",[10503,10505],{"paper_id":10496,"author_seq":247,"given_name":7549,"surname":10504,"affiliation":63,"orcid":63},"Klang",{"paper_id":10496,"author_seq":232,"given_name":1388,"surname":10506,"affiliation":63,"orcid":63},"Nugues","In this paper, we describe a new system to extract, index, search, and visualize entities in Wikipedia. To carry out the entity extraction, we designed a high-performance, multilingual, entity linker and we used a document model to store the resulting linguistic annotations. The entity linker, HEDWIG, extracts the mentions from text using a string matching engine and links them to entities with a combination of statistical rules and PageRank. The document model, Docforia, consists of layers, where each layer is a sequence of ranges describing a specific annotation, here the entities. We evaluated HEDWIG with the TAC 2016 data and protocol and we reached the CEAFm scores of 70.0 on English, on 64.4 on Chinese, and 66.5 on Spanish. 
We applied the entity linker to the whole collection of English and Swedish articles of Wikipedia and we used Lucene to index the layers and a search module to interactively retrieve all the concordances of an entity in Wikipedia. The user can select and visualize the concordances in the articles or paragraphs. Contrary to classic text indexing, this system does not use strings to identify the entities but unique identifiers from Wikidata. A demonstration of the entity search and visualization will be available for English at this address http:\u002F\u002Fvilde.cs.lth.se:9001\u002Fen-hedwig\u002F and for Swedish at: http:\u002F\u002Fvilde.cs.lth.se:9001\u002Fsv-hedwig\u002F.",{"paper_id":10509,"title":10510,"year":81,"month":855,"day":63,"doi":10511,"resource_url":10512,"first_page":63,"last_page":63,"pdf_url":10513,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10514,"paper_type":860,"authors":10515,"abstract":10520},"lrec2018-main-541","Learning to Map Natural Language Statements into Knowledge Base Representations for Knowledge Base Construction","10.63317\u002F46di8wkiyfhy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-541","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F94.pdf","lin-etal-2018-learning-map",[10516,10518,10519],{"paper_id":10509,"author_seq":247,"given_name":10517,"surname":9639,"affiliation":63,"orcid":63},"Chin-Ho",{"paper_id":10509,"author_seq":232,"given_name":3505,"surname":3506,"affiliation":63,"orcid":63},{"paper_id":10509,"author_seq":218,"given_name":3511,"surname":2401,"affiliation":63,"orcid":63},"Directly adding the knowledge triples obtained from open information extraction systems into a knowledge base is often impractical due to a vocabulary gap between natural language (NL) expressions and knowledge base (KB) representation. This paper aims at learning to map relational phrases in triples from natural-language-like statement to knowledge base predicate format. 
We train a word representation model on a vector space and link each NL relational pattern to the semantically equivalent KB predicate. Our mapping result shows not only high quality, but also promising coverage on relational phrases compared to previous research.",{"paper_id":10522,"title":10523,"year":81,"month":855,"day":63,"doi":10524,"resource_url":10525,"first_page":63,"last_page":63,"pdf_url":10526,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10527,"paper_type":860,"authors":10528,"abstract":10532},"lrec2018-main-542","Building a Knowledge Graph from Natural Language Definitions for Interpretable Text Entailment Recognition","10.63317\u002F3t8ubr3k55yk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-542","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F190.pdf","silva-etal-2018-building",[10529,10530,10531],{"paper_id":10522,"author_seq":247,"given_name":2905,"surname":4259,"affiliation":63,"orcid":63},{"paper_id":10522,"author_seq":232,"given_name":4788,"surname":4789,"affiliation":63,"orcid":63},{"paper_id":10522,"author_seq":218,"given_name":4791,"surname":4792,"affiliation":63,"orcid":63},"Natural language definitions of terms can serve as a rich source of knowledge, but structuring them into a comprehensible semantic model is essential to enable them to be used in semantic interpretation tasks. We propose a method and provide a set of tools for automatically building a graph world knowledge base from natural language definitions. Adopting a conceptual model composed of a set of semantic roles for dictionary definitions, we trained a classifier for automatically labeling definitions, preparing the data to be later converted to a graph representation. 
WordNetGraph, a knowledge graph built out of noun and verb WordNet definitions according to this methodology, was successfully used in an interpretable text entailment recognition approach which uses paths in this graph to provide clear justifications for entailment decisions.",{"paper_id":10534,"title":10535,"year":81,"month":855,"day":63,"doi":10536,"resource_url":10537,"first_page":63,"last_page":63,"pdf_url":10538,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10539,"paper_type":860,"authors":10540,"abstract":10549},"lrec2018-main-543","Combining rule-based and embedding-based approaches to normalize textual entities with an ontology","10.63317\u002F2fvbgwgrd6zp","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-543","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F446.pdf","ferre-etal-2018-combining",[10541,10543,10546,10547],{"paper_id":10534,"author_seq":247,"given_name":7055,"surname":10542,"affiliation":63,"orcid":63},"Ferré",{"paper_id":10534,"author_seq":232,"given_name":10544,"surname":10545,"affiliation":63,"orcid":63},"Louise","Deléger",{"paper_id":10534,"author_seq":218,"given_name":1388,"surname":1389,"affiliation":63,"orcid":63},{"paper_id":10534,"author_seq":203,"given_name":1217,"surname":10548,"affiliation":63,"orcid":63},"Nédellec","In this paper, we propose a two-step method to normalize multi-word terms with concepts from a domain-specific ontology. Normalization is a critical step of information extraction. The method uses vector representations of terms computed with word embedding information and hierarchical information among ontology concepts. A training dataset and a first result dataset with high precision and low recall are generated by using the ToMap unsupervised normalization method. It is based on the similarities between the form of the term to normalize and the form of concept labels. 
Then, a projection of the space of terms towards the space of concepts is learned by globally minimizing the distances between vectors of terms and vectors of concepts. It applies multivariate linear regression using the previously generated training dataset. Finally, a distance calculation is carried out between the projections of term vectors and the concept vectors, providing a prediction of normalization by a concept for each term. This method was evaluated through the categorization task of bacterial habitats of BioNLP Shared Task 2016. Our results largely outperform all existing systems on this task, opening up very encouraging prospects.",{"paper_id":10551,"title":10552,"year":81,"month":855,"day":63,"doi":10553,"resource_url":10554,"first_page":63,"last_page":63,"pdf_url":10555,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10556,"paper_type":860,"authors":10557,"abstract":10577},"lrec2018-main-544","T-REx: A Large Scale Alignment of Natural Language with Knowledge Base 
Triples","10.63317\u002F2yu5nu76n7bu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-544","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F632.pdf","elsahar-etal-2018-rex",[10558,10561,10564,10567,10569,10572,10575],{"paper_id":10551,"author_seq":247,"given_name":10559,"surname":10560,"affiliation":63,"orcid":63},"Hady","Elsahar",{"paper_id":10551,"author_seq":232,"given_name":10562,"surname":10563,"affiliation":63,"orcid":63},"Pavlos","Vougiouklis",{"paper_id":10551,"author_seq":218,"given_name":10565,"surname":10566,"affiliation":63,"orcid":63},"Arslen","Remaci",{"paper_id":10551,"author_seq":203,"given_name":9004,"surname":10568,"affiliation":63,"orcid":63},"Gravier",{"paper_id":10551,"author_seq":188,"given_name":10570,"surname":10571,"affiliation":63,"orcid":63},"Jonathon","Hare",{"paper_id":10551,"author_seq":172,"given_name":10573,"surname":10574,"affiliation":63,"orcid":63},"Frederique","Laforest",{"paper_id":10551,"author_seq":155,"given_name":3684,"surname":10576,"affiliation":63,"orcid":63},"Simperl","Alignments between natural language and Knowledge Base (KB) triples are an essential prerequisite for training machine learning approaches employed in a variety of Natural Language Processing problems. These include Relation Extraction, KB Population, Question Answering and Natural Language Generation from KB triples. Available datasets that provide those alignments are plagued by significant shortcomings – they are of limited size, they exhibit a restricted predicate coverage, and\u002For they are of unreported quality. To alleviate these shortcomings, we present T-REx, a dataset of large-scale alignments between Wikipedia abstracts and Wikidata triples.T-REx consists of 11 million triples aligned with 3.09 million Wikipedia abstracts (6.2 million sentences). T-REx is two orders of magnitude larger than the largest available alignments dataset and covers 2.5 times more predicates. 
Additionally, we stress the quality of this language resource thanks to an extensive crowdsourcing evaluation. T-REx is publicly available at https:\u002F\u002Fw3id.org\u002Ft-rex.",{"paper_id":10579,"title":10580,"year":81,"month":855,"day":63,"doi":10581,"resource_url":10582,"first_page":63,"last_page":63,"pdf_url":10583,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10584,"paper_type":860,"authors":10585,"abstract":10591},"lrec2018-main-545","Multilingual Parallel Corpus for Global Communication Plan","10.63317\u002F3bndwgz6k5uw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-545","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F104.pdf","imamura-sumita-2018-multilingual",[10586,10588],{"paper_id":10579,"author_seq":247,"given_name":2474,"surname":10587,"affiliation":63,"orcid":63},"Imamura",{"paper_id":10579,"author_seq":232,"given_name":10589,"surname":10590,"affiliation":63,"orcid":63},"Eiichiro","Sumita","In this paper, we introduce the Global Communication Plan (GCP) Corpus, a multilingual parallel corpus being developed as part of the GCP. The GCP Corpus is intended to be used to develop speech translation systems; thus, it primarily consists of pseudo-dialogues between foreign visitors and local Japanese people. The GCP Corpus is sentence-aligned and covers four domains and ten languages, including many Asian languages. In this paper, we summarize the GCP and the current status of the GCP Corpus. 
Then, we describe some of the corpus' basic characteristics from the perspective of multilingual machine translation and compare direct, pivot, and zero-shot translation techniques.",{"paper_id":10593,"title":10594,"year":81,"month":855,"day":63,"doi":10595,"resource_url":10596,"first_page":63,"last_page":63,"pdf_url":10597,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10598,"paper_type":860,"authors":10599,"abstract":10607},"lrec2018-main-546","A Large Parallel Corpus of Full-Text Scientific Articles","10.63317\u002F2drfz27mfdwo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-546","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F370.pdf","soares-etal-2018-large",[10600,10603,10605],{"paper_id":10593,"author_seq":247,"given_name":10601,"surname":10602,"affiliation":63,"orcid":63},"Felipe","Soares",{"paper_id":10593,"author_seq":232,"given_name":10604,"surname":6977,"affiliation":63,"orcid":63},"Viviane",{"paper_id":10593,"author_seq":218,"given_name":1759,"surname":10606,"affiliation":63,"orcid":63},"Becker","The Scielo database is an important source of scientific information in Latin America, containing articles from several research domains. A striking characteristic of Scielo is that many of its full-text contents are presented in more than one language, thus being a potential source of parallel corpora. In this article, we present the development of a parallel corpus from Scielo in three languages: English, Portuguese, and Spanish. Sentences were automatically aligned using the Hunalign algorithm for all language pairs, and for a subset of trilingual articles also. We demonstrate the capabilities of our corpus by training a Statistical Machine Translation system (Moses) for each language pair, which outperformed related works on scientific articles. Sentence alignment was also manually evaluated, presenting an average of 98.8% correctly aligned sentences across all languages. 
Our parallel corpus is freely available in the TMX format, with complementary information regarding article metadata.",{"paper_id":10609,"title":10610,"year":81,"month":855,"day":63,"doi":10611,"resource_url":10612,"first_page":63,"last_page":63,"pdf_url":10613,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10614,"paper_type":860,"authors":10615,"abstract":10620},"lrec2018-main-547","NegPar: A parallel corpus annotated for negation","10.63317\u002F26bd8vpqnjtq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-547","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F500.pdf","liu-etal-2018-negpar",[10616,10618,10619],{"paper_id":10609,"author_seq":247,"given_name":10617,"surname":5232,"affiliation":63,"orcid":63},"Qianchu",{"paper_id":10609,"author_seq":232,"given_name":949,"surname":950,"affiliation":63,"orcid":63},{"paper_id":10609,"author_seq":218,"given_name":946,"surname":947,"affiliation":63,"orcid":63},"Although the existence of English corpora annotated for negation has allowed for extensive work on monolingual negation detection, little is understood on how negation-related phenomena translate across languages. The current study fills this gap by presenting NegPar, the first English-Chinese parallel corpus annotated for negation in the narrative domain (a collection of stories from Conan Doyle's Sherlock Holmes). While we followed the annotation guidelines in the ConanDoyleNeg corpus (Morante and Daelemans, 2012), we reannotated certain scope-related phenomena to ensure more consistent and interpretable semantic representation. To both ease the annotation process and analyze how similar negation is signaled in the two languages, we experimented with first projecting the annotations from English and then manually correcting the projection output in Chinese. 
Results show that projecting negation via word-alignment offers limited help to the annotation process, as negation can be rendered in different ways across languages.",{"paper_id":10622,"title":10623,"year":81,"month":855,"day":63,"doi":10624,"resource_url":10625,"first_page":63,"last_page":63,"pdf_url":10626,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10627,"paper_type":860,"authors":10628,"abstract":10636},"lrec2018-main-548","The IIT Bombay English-Hindi Parallel Corpus","10.63317\u002F2erd8c5iqpzz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-548","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F847.pdf","kunchukuttan-etal-2018-iit",[10629,10632,10635],{"paper_id":10622,"author_seq":247,"given_name":10630,"surname":10631,"affiliation":63,"orcid":63},"Anoop","Kunchukuttan",{"paper_id":10622,"author_seq":232,"given_name":10633,"surname":10634,"affiliation":63,"orcid":63},"Pratik","Mehta",{"paper_id":10622,"author_seq":218,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},"We present the IIT Bombay English-Hindi Parallel Corpus. The corpus is a compilation of parallel corpora previously available in the public domain as well as new parallel corpora we collected. The corpus contains 1.49 million parallel segments, of which 694k segments were not previously available in the public domain. The corpus has been pre-processed for machine translation, and we report baseline phrase-based SMT and NMT translation results on this corpus. This corpus has been used in two editions of shared tasks at the Workshop on Asian Language Translation (2016 and 2017). The corpus is freely available for non-commercial research. 
To the best of our knowledge, this is the largest publicly available English-Hindi parallel corpus.",{"paper_id":10638,"title":10639,"year":81,"month":855,"day":63,"doi":10640,"resource_url":10641,"first_page":63,"last_page":63,"pdf_url":10642,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10643,"paper_type":860,"authors":10644,"abstract":10654},"lrec2018-main-549","Extracting an English-Persian Parallel Corpus from Comparable Corpora","10.63317\u002F4xyw9m5oixtg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-549","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F674.pdf","karimi-etal-2018-extracting",[10645,10648,10651],{"paper_id":10638,"author_seq":247,"given_name":10646,"surname":10647,"affiliation":63,"orcid":63},"Akbar","Karimi",{"paper_id":10638,"author_seq":232,"given_name":10649,"surname":10650,"affiliation":63,"orcid":63},"Ebrahim","Ansari",{"paper_id":10638,"author_seq":218,"given_name":10652,"surname":10653,"affiliation":63,"orcid":63},"Bahram","Sadeghi Bigham","Parallel data are an important part of a reliable Statistical Machine Translation (SMT) system. The more of these data are available, the better the quality of the SMT system. However, for some language pairs such as Persian-English, parallel sources of this kind are scarce. In this paper, a bidirectional method is proposed to extract parallel sentences from English and Persian document aligned Wikipedia. Two machine translation systems are employed to translate from Persian to English and the reverse after which an IR system is used to measure the similarity of the translated sentences. Adding the extracted sentences to the training data of the existing SMT systems is shown to improve the quality of the translation. Furthermore, the proposed method slightly outperforms the one-directional approach. 
The extracted corpus consists of about 200,000 sentences which have been sorted by their degree of similarity calculated by the IR system and is freely available for public access on the Web.",{"paper_id":10656,"title":10657,"year":81,"month":855,"day":63,"doi":10658,"resource_url":10659,"first_page":63,"last_page":63,"pdf_url":10660,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10661,"paper_type":860,"authors":10662,"abstract":10669},"lrec2018-main-550","Learning Word Vectors for 157 Languages","10.63317\u002F2dwkd5xar3i5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-550","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F627.pdf","grave-etal-2018-learning",[10663,10664,10665,10667,10668],{"paper_id":10656,"author_seq":247,"given_name":993,"surname":994,"affiliation":63,"orcid":63},{"paper_id":10656,"author_seq":232,"given_name":996,"surname":997,"affiliation":63,"orcid":63},{"paper_id":10656,"author_seq":218,"given_name":10666,"surname":2603,"affiliation":63,"orcid":63},"Prakhar",{"paper_id":10656,"author_seq":203,"given_name":1001,"surname":1002,"affiliation":63,"orcid":63},{"paper_id":10656,"author_seq":188,"given_name":990,"surname":991,"affiliation":63,"orcid":63},"Distributed word representations, or word vectors, have recently been applied to many tasks in natural language processing, leading to state-of-the-art performance. A key ingredient to the successful application of these representations is to train them on very large corpora, and use these pre-trained models in downstream tasks. In this paper, we describe how we trained such high quality word representations for 157 languages. We used two sources of data to train these models: the free online encyclopedia Wikipedia and data from the common crawl project. We also introduce three new word analogy datasets to evaluate these word vectors, for French, Hindi and Polish. 
Finally, we evaluate our pre-trained word vectors on 10 languages for which evaluation datasets exist, showing very strong performance compared to previous models.",{"paper_id":10671,"title":10672,"year":81,"month":855,"day":63,"doi":10673,"resource_url":10674,"first_page":63,"last_page":63,"pdf_url":10675,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10676,"paper_type":860,"authors":10677,"abstract":10689},"lrec2018-main-551","SumeCzech: Large Czech News-Based Summarization Dataset","10.63317\u002F4qvaw62j5n6b","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-551","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F825.pdf","straka-etal-2018-sumeczech",[10678,10679,10682,10684,10685,10688],{"paper_id":10671,"author_seq":247,"given_name":5413,"surname":5414,"affiliation":63,"orcid":63},{"paper_id":10671,"author_seq":232,"given_name":10680,"surname":10681,"affiliation":63,"orcid":63},"Nikita","Mediankin",{"paper_id":10671,"author_seq":218,"given_name":2561,"surname":10683,"affiliation":63,"orcid":63},"Kocmi",{"paper_id":10671,"author_seq":203,"given_name":6226,"surname":6227,"affiliation":63,"orcid":63},{"paper_id":10671,"author_seq":188,"given_name":10686,"surname":10687,"affiliation":63,"orcid":63},"Vojtěch","Hudeček",{"paper_id":10671,"author_seq":172,"given_name":2633,"surname":3462,"affiliation":63,"orcid":63},"Document summarization is a well-studied NLP task. With the emergence of artificial neural network models, the summarization performance is increasing, as are the requirements on training data. However, only a few datasets are available for Czech, none of them particularly large. Additionally, summarization has been evaluated predominantly on English, with the commonly used ROUGE metric being English-specific. In this paper, we try to address both issues. We present SumeCzech, a Czech news-based summarization dataset. 
It contains more than a million documents, each consisting of a headline, a several sentences long abstract and a full text. The dataset can be downloaded using the provided scripts available at http:\u002F\u002Fhdl.handle.net\u002F11234\u002F1-2615. We evaluate several summarization baselines on the dataset, including a strong abstractive approach based on Transformer neural network architecture. The evaluation is performed using a language-agnostic variant of ROUGE.",{"paper_id":10691,"title":10692,"year":81,"month":855,"day":63,"doi":10693,"resource_url":10694,"first_page":63,"last_page":63,"pdf_url":10695,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10696,"paper_type":860,"authors":10697,"abstract":10702},"lrec2018-main-552","A Diachronic Corpus for Literary Style Analysis","10.63317\u002F3rafdypkt8pp","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-552","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F864.pdf","klaussner-vogel-2018-diachronic",[10698,10701],{"paper_id":10691,"author_seq":247,"given_name":10699,"surname":10700,"affiliation":63,"orcid":63},"Carmen","Klaussner",{"paper_id":10691,"author_seq":232,"given_name":4210,"surname":4211,"affiliation":63,"orcid":63},"This research presents a resource  for diachronic style analysis in particular  the analysis of literary authors over time. Temporal style analysis has received comparatively  little attention over the past years in spite of the possibility of an author's style frequently changing over time,  a property that is not only interesting in its own  right, but which also has implications for synchronic  analyses of style. The corpus contains 22 American literary authors from the mid 19th to early 20th century that wrote largely in parallel. 
After describing the resource, we show how  the corpus can be used to detect changing features  in literary style.",{"paper_id":10704,"title":10705,"year":81,"month":855,"day":63,"doi":10706,"resource_url":10707,"first_page":63,"last_page":63,"pdf_url":10708,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10709,"paper_type":860,"authors":10710,"abstract":10720},"lrec2018-main-553","Text Simplification from Professionally Produced Corpora","10.63317\u002F5gudtrhqasz3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-553","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1063.pdf","scarton-etal-2018-text",[10711,10714,10717],{"paper_id":10704,"author_seq":247,"given_name":10712,"surname":10713,"affiliation":63,"orcid":63},"Carolina","Scarton",{"paper_id":10704,"author_seq":232,"given_name":10715,"surname":10716,"affiliation":63,"orcid":63},"Gustavo","Paetzold",{"paper_id":10704,"author_seq":218,"given_name":10718,"surname":10719,"affiliation":63,"orcid":63},"Lucia","Specia","The lack of large and reliable datasets has been hindering progress in Text Simplification (TS). We investigate the application of the recently created Newsela corpus, the largest collection of professionally written simplifications available, in TS tasks. Using new alignment algorithms, we extract 550,644 complex-simple sentence pairs from the corpus. This data is explored in different ways: (i)  we show that traditional readability metrics  capture surprisingly well the different complexity levels in this corpus, (ii) we build machine learning models to classify sentences into complex vs. 
simple and to predict complexity levels that outperform their respective baselines, (iii)  we introduce a lexical simplifier that uses the corpus to generate candidate simplifications and outperforms the state of the art approaches, and (iv) we show that the corpus can be used to learn sentence simplification patterns in more effective ways than corpora used in previous work.",{"paper_id":10722,"title":10723,"year":81,"month":855,"day":63,"doi":10724,"resource_url":10725,"first_page":63,"last_page":63,"pdf_url":10726,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10727,"paper_type":860,"authors":10728,"abstract":10739},"lrec2018-main-554","Intertextual Correspondence for Integrating Corpora","10.63317\u002F236ozd84gwhn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-554","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1087.pdf","visser-etal-2018-intertextual",[10729,10732,10735,10737],{"paper_id":10722,"author_seq":247,"given_name":10730,"surname":10731,"affiliation":63,"orcid":63},"Jacky","Visser",{"paper_id":10722,"author_seq":232,"given_name":10733,"surname":10734,"affiliation":63,"orcid":63},"Rory","Duthie",{"paper_id":10722,"author_seq":218,"given_name":1734,"surname":10736,"affiliation":63,"orcid":63},"Lawrence",{"paper_id":10722,"author_seq":203,"given_name":1367,"surname":10738,"affiliation":63,"orcid":63},"Reed","We present intertextual correspondence (ITC) as an integrative technique for combining annotated text corpora. The topical correspondence between different texts can be exploited to establish new annotation connections between existing corpora. Although the general idea should not be restricted to one particular theoretical framework, we explain how the annotation of intertextual correspondence works for two corpora annotated with argumentative notions on the basis of Inference Anchoring Theory. 
The annotated corpora we take as examples are topically and temporally related: the first corpus comprises television debates leading up to the 2016 presidential elections in the United States, the second corpus consists of commentary on and discussion of those debates on the social media platform Reddit. The integrative combination enriches the existing corpora in terms of the argumentative density, conceived of as the number of inference, conflict and rephrase relations relative to the word count of the (sub-)corpus. ITC also affects the global properties of the corpus, such as the most divisive issue. Moreover, the ability to extend existing corpora whilst maintaining the level of internal cohesion is beneficial to the use of the integrated corpus as resource for text and argument mining based on machine learning.",{"paper_id":10741,"title":10742,"year":81,"month":855,"day":63,"doi":10743,"resource_url":10744,"first_page":63,"last_page":63,"pdf_url":10745,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10746,"paper_type":860,"authors":10747,"abstract":10753},"lrec2018-main-555","A Gold Anaphora Annotation Layer on an Eye Movement Corpus","10.63317\u002F35gzmakviiyv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-555","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F318.pdf","seminck-amsili-2018-gold",[10748,10750],{"paper_id":10741,"author_seq":247,"given_name":3749,"surname":10749,"affiliation":63,"orcid":63},"Seminck",{"paper_id":10741,"author_seq":232,"given_name":10751,"surname":10752,"affiliation":63,"orcid":63},"Pascal","Amsili","Anaphora resolution is a complex process in which multiple linguistic factors play a role, and this is witnessed by a large psycholinguistic literature. This literature is based on experiments with hand-constructed items, which have the advantage to filter influences outside the scope of the study, but, as a downside, make the experimental data artificial. 
Our goal is to provide a first resource allowing to study human anaphora resolution on natural data. We annotated anaphorical pronouns in the Dundee Corpus: a corpus of 50k words coming from newspaper articles read by humans of whom all eye movements were recorded. We identified all anaphoric pronouns - in opposition to non-referential, cataphoric and deictic uses - and identified the closest antecedent for each of them. Both the identification of the anaphoricity and the antecedents of the pronouns showed a high inter-annotator agreement. We used our resource to model reading time of pronouns to study simultaneously various factors of influence on anaphora resolution. Whereas the influence of the anaphoric relation on the reading time of the pronoun is subtle, psycholinguistic findings from settings using experimental items were confirmed. In this way our resource provides a new means to study anaphora.",{"paper_id":10755,"title":10756,"year":81,"month":855,"day":63,"doi":10757,"resource_url":10758,"first_page":63,"last_page":63,"pdf_url":10759,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10760,"paper_type":860,"authors":10761,"abstract":10770},"lrec2018-main-556","Annotating Zero Anaphora for Question Answering","10.63317\u002F4fktwxh6ovvk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-556","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F551.pdf","asao-etal-2018-annotating",[10762,10764,10767],{"paper_id":10755,"author_seq":247,"given_name":2913,"surname":10763,"affiliation":63,"orcid":63},"Asao",{"paper_id":10755,"author_seq":232,"given_name":10765,"surname":10766,"affiliation":63,"orcid":63},"Ryu","Iida",{"paper_id":10755,"author_seq":218,"given_name":10768,"surname":10769,"affiliation":63,"orcid":63},"Kentaro","Torisawa","We constructed a large annotated dataset of zero pronouns that correspond to adjuncts marked by -de (translated to English as 'in', 'at', 'by' or 'with') in Japanese. 
Adjunct zero anaphora resolution plays an important role in extracting information such as location and means from a text. To our knowledge, however, there have been no large-scale dataset covering them. In this paper, focusing on the application of zero anaphora resolution to question answering (QA), we proposed two annotation schemes. The first scheme was designed to efficiently collect zero anaphora instances that are useful in QA. Instead of directly annotating zero anaphora, annotators evaluated QA instances whose correctness hinges on zero anaphora resolution. Over 20,000 instances of zero anaphora were collected with this scheme. We trained a multi-column convolutional neural network with the annotated data, achieving an average precision of 0.519 in predicting the correctness of QA instances of the same type. In the second scheme, zero anaphora is annotated in a more direct manner. A model trained with the results of the second annotation scheme performed better than the first scheme in identifying zero anaphora for sentences randomly sampled from a corpus, suggesting a tradeoff between application-specific and general-purpose annotation schemes.",{"paper_id":10772,"title":10773,"year":81,"month":855,"day":63,"doi":10774,"resource_url":10775,"first_page":63,"last_page":63,"pdf_url":10776,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10777,"paper_type":860,"authors":10778,"abstract":10789},"lrec2018-main-557","Building Named Entity Recognition Taggers via Parallel 
Corpora","10.63317\u002F3x5mgrguo3ab","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-557","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F965.pdf","agerri-etal-2018-building",[10779,10780,10782,10784,10787,10788],{"paper_id":10772,"author_seq":247,"given_name":1953,"surname":7501,"affiliation":63,"orcid":63},{"paper_id":10772,"author_seq":232,"given_name":10781,"surname":4236,"affiliation":63,"orcid":63},"Yiling",{"paper_id":10772,"author_seq":218,"given_name":8039,"surname":10783,"affiliation":63,"orcid":63},"Aldabe",{"paper_id":10772,"author_seq":203,"given_name":10785,"surname":10786,"affiliation":63,"orcid":63},"Nora","Aranberri",{"paper_id":10772,"author_seq":188,"given_name":8044,"surname":8045,"affiliation":63,"orcid":63},{"paper_id":10772,"author_seq":172,"given_name":6777,"surname":6778,"affiliation":63,"orcid":63},"The lack of hand curated data is a major impediment to developing statistical semantic processors for many of the world languages. Our paper aims to bridge this gap by leveraging existing annotations and semantic processors from multiple source languages by projecting their annotations via the statistical word alignments traditionally used in Machine Translation. Taking the Named Entity Recognition (NER) task as a use case, this work presents a method to automatically induce Named Entity annotated data using parallel corpora without any manual intervention. The projected annotations can then be used to automatically generate semantic processors for the target language helping to overcome the lack of training data for a given language. The experiments are focused on 4 languages: German, English, Spanish and Italian, and our empirical evaluation results show that our method obtains competitive results when compared with models trained on gold-standard, albeit out-of-domain, data. 
The results point out that our projection algorithm is effective to transport NER annotations across languages thus providing a fully automatic method to obtain NER taggers for as many as the number of languages aligned in parallel corpora. Every resource generated (training data, manually annotated test set and NER models) is made publicly available for its use and to facilitate reproducibility of results.",{"paper_id":10791,"title":10792,"year":81,"month":855,"day":63,"doi":10793,"resource_url":10794,"first_page":63,"last_page":63,"pdf_url":10795,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10796,"paper_type":860,"authors":10797,"abstract":10808},"lrec2018-main-558","Cross-Document, Cross-Language Event Coreference Annotation Using Event Hoppers","10.63317\u002F58cyairar5w5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-558","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F974.pdf","song-etal-2018-cross",[10798,10799,10800,10803,10805,10806],{"paper_id":10791,"author_seq":247,"given_name":5379,"surname":2599,"affiliation":63,"orcid":63},{"paper_id":10791,"author_seq":232,"given_name":5769,"surname":5770,"affiliation":63,"orcid":63},{"paper_id":10791,"author_seq":218,"given_name":10801,"surname":10802,"affiliation":63,"orcid":63},"Justin","Mott",{"paper_id":10791,"author_seq":203,"given_name":10804,"surname":1591,"affiliation":63,"orcid":63},"Xuansong",{"paper_id":10791,"author_seq":188,"given_name":1205,"surname":5377,"affiliation":63,"orcid":63},{"paper_id":10791,"author_seq":172,"given_name":1359,"surname":10807,"affiliation":63,"orcid":63},"Caruso","We discuss the development and implementation of an approach for cross-document, cross-lingual event coreference for the DEFT Rich Entities, Relations and Events (Rich ERE) annotation task. 
Rich ERE defined the notion of event hoppers to enable intuitive within-document coreference for the DEFT event ontology, and the expansion of coreference to cross-document, cross-lingual event mentions relies crucially on this same construct. We created new annotation guidelines, data processes and user interfaces to enable annotation of 505 documents in three languages selected from data already labeled for Rich ERE, yielding 389 cross-document event hoppers. We discuss the data creation process and the central role of event hoppers in making cross-document, cross-lingual coreference decisions. We present the challenges encountered during annotation along with three directions for future work.",{"paper_id":10810,"title":10811,"year":81,"month":855,"day":63,"doi":10812,"resource_url":10813,"first_page":63,"last_page":63,"pdf_url":10814,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10815,"paper_type":860,"authors":10816,"abstract":10827},"lrec2018-main-559","TAP-DLND 1.0 : A Corpus for Document Level Novelty Detection","10.63317\u002F3hwywkyeb4hz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-559","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F479.pdf","ghosal-etal-2018-tap",[10817,10820,10822,10825,10826],{"paper_id":10810,"author_seq":247,"given_name":10818,"surname":10819,"affiliation":63,"orcid":63},"Tirthankar","Ghosal",{"paper_id":10810,"author_seq":232,"given_name":10821,"surname":8340,"affiliation":63,"orcid":63},"Amitra",{"paper_id":10810,"author_seq":218,"given_name":10823,"surname":10824,"affiliation":63,"orcid":63},"Swati","Tiwari",{"paper_id":10810,"author_seq":203,"given_name":1861,"surname":1862,"affiliation":63,"orcid":63},{"paper_id":10810,"author_seq":188,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},"Detecting novelty of an entire document is an Artificial Intelligence (AI) frontier problem. 
This has immense importance in widespread Natural Language Processing (NLP) applications ranging from extractive text document summarization to tracking development of news events to predicting impact of scholarly articles. Although a very relevant problem in the present context of exponential data duplication, we are unaware of any document level dataset that correctly addresses the evaluation of automatic novelty detection techniques in a classification framework. To bridge this relative gap, here in this work, we present a resource for benchmarking the techniques for document level novelty detection. We create the resource via topic-specific crawling of news documents across several domains in a periodic manner. We release the annotated corpus with necessary statistics and show its use with a developed system for the problem in concern.",{"paper_id":10829,"title":10830,"year":81,"month":855,"day":63,"doi":10831,"resource_url":10832,"first_page":63,"last_page":63,"pdf_url":10833,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10834,"paper_type":860,"authors":10835,"abstract":10840},"lrec2018-main-560","A Corpus for Multilingual Document Classification in Eight Languages","10.63317\u002F52gbaquzq5uu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-560","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F658.pdf","schwenk-li-2018-corpus",[10836,10839],{"paper_id":10829,"author_seq":247,"given_name":10837,"surname":10838,"affiliation":63,"orcid":63},"Holger","Schwenk",{"paper_id":10829,"author_seq":232,"given_name":9749,"surname":1591,"affiliation":63,"orcid":63},"Cross-lingual document classification aims at training a document classifier on resources in one language and transferring it to a different language without any additional resources. Several approaches have been proposed in the literature and the current best practice is to evaluate them on a subset of the Reuters Corpus Volume 2. 
However, this subset covers only few languages (English, German, French and Spanish) and almost all published works focus on the transfer between English and German.  In addition, we have observed that the class prior distributions differ significantly between the languages. We argue that this complicates the evaluation of the multilinguality. In this paper, we propose a new subset of the Reuters corpus with balanced class priors for eight languages. By adding Italian, Russian, Japanese and Chinese, we cover languages which are very different with respect to syntax, morphology, etc.  We provide strong baselines for all language transfer directions using multilingual word and sentence embeddings respectively. Our goal is to offer a freely available framework to evaluate cross-lingual document classification, and we hope to foster by these means, research in this important area.",{"paper_id":10842,"title":10843,"year":81,"month":855,"day":63,"doi":10844,"resource_url":10845,"first_page":63,"last_page":63,"pdf_url":10846,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10847,"paper_type":860,"authors":10848,"abstract":10856},"lrec2018-main-561","Analyzing Citation-Distance Networks for Evaluating Publication Impact","10.63317\u002F535zcpzodxsw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-561","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F782.pdf","herrmannova-etal-2018-analyzing",[10849,10852,10854],{"paper_id":10842,"author_seq":247,"given_name":10850,"surname":10851,"affiliation":63,"orcid":63},"Drahomira","Herrmannova",{"paper_id":10842,"author_seq":232,"given_name":6915,"surname":10853,"affiliation":63,"orcid":63},"Knoth",{"paper_id":10842,"author_seq":218,"given_name":3376,"surname":10855,"affiliation":63,"orcid":63},"Patton","Studying citation patterns of scholarly articles has been of interest to many researchers from various disciplines. 
While the relationship of citations and scientific impact has been widely studied in the literature, in this paper we develop the idea of analyzing the semantic distance of scholarly articles in a citation network (citation-distance network) to uncover patterns that reflect scientific impact. More specifically, we compare two types of publications in terms of their citation-distance patterns, seminal publications and literature reviews, and focus on their referencing patterns as well as on publications which cite them. We show that seminal publications are associated with a larger semantic distance, measured using the content of the articles, between their references and the citing publications, while literature reviews tend to cite publications from a wider range of topics. Our motivation is to understand and utilize this information to create new research evaluation metrics which would better reflect scientific impact.",{"paper_id":10858,"title":10859,"year":81,"month":855,"day":63,"doi":10860,"resource_url":10861,"first_page":63,"last_page":63,"pdf_url":10862,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10863,"paper_type":860,"authors":10864,"abstract":10869},"lrec2018-main-562","Annotating Educational Questions for Student Response Analysis","10.63317\u002F4hjqyvt6vw49","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-562","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1001.pdf","godea-nielsen-2018-annotating",[10865,10868],{"paper_id":10858,"author_seq":247,"given_name":10866,"surname":10867,"affiliation":63,"orcid":63},"Andreea","Godea",{"paper_id":10858,"author_seq":232,"given_name":5345,"surname":5346,"affiliation":63,"orcid":63},"Questions play an important role in the educational domain, representing the main form of interaction between instructors and students. 
In this paper, we introduce the first taxonomy and annotated educational corpus of questions that aims to help with the analysis of student responses. The dataset can be employed in approaches that classify questions based on the expected answer types. This can be an important component in applications that require prior knowledge about the desired answer to a given question, such as educational and question answering systems. To demonstrate the applicability and the effectiveness of the data within approaches to classify questions based on expected answer types, we performed extensive experiments on our dataset using a neural network with word embeddings as features. The approach achieved a weighted F1-score of 0.511, overcoming the baseline by 12%. This demonstrates that our corpus can be effectively integrated in simple approaches that classify questions based on the response type.",{"paper_id":10871,"title":10872,"year":81,"month":855,"day":63,"doi":10873,"resource_url":10874,"first_page":63,"last_page":63,"pdf_url":10875,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10876,"paper_type":860,"authors":10877,"abstract":10881},"lrec2018-main-563","Incorporating Global Contexts into Sentence Embedding for Relational Extraction at the Paragraph Level with Distant Supervision","10.63317\u002F2twfm22v6xub","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-563","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F158.pdf","kim-choi-2018-incorporating",[10878,10880],{"paper_id":10871,"author_seq":247,"given_name":10879,"surname":1104,"affiliation":63,"orcid":63},"Eun-kyung",{"paper_id":10871,"author_seq":232,"given_name":1109,"surname":1110,"affiliation":63,"orcid":63},"The increased demand for structured knowledge has created considerable interest in relation extraction (RE) from large collections of documents.  In particular, distant supervision can be used for RE without manual annotation costs.  
Nevertheless, this paradigm only extracts relations from individual sentences that contain two target entities.  This paper explores the incorporation of global contexts derived from paragraph-into-sentence embedding as a means of compensating for the shortage of training data in distantly supervised RE.  Experiments on RE from Korean Wikipedia show that the presented approach can learn an exact RE from sentences (including grammatically incoherent sentences) without syntactic parsing.",{"paper_id":10883,"title":10884,"year":81,"month":855,"day":63,"doi":10885,"resource_url":10886,"first_page":63,"last_page":63,"pdf_url":10887,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10888,"paper_type":860,"authors":10889,"abstract":10895},"lrec2018-main-564","MCScript: A Novel Dataset for Assessing Machine Comprehension Using Script Knowledge","10.63317\u002F2x36g84ob95p","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-564","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F225.pdf","ostermann-etal-2018-mcscript",[10890,10891,10892,10893,10894],{"paper_id":10883,"author_seq":247,"given_name":1269,"surname":9982,"affiliation":63,"orcid":63},{"paper_id":10883,"author_seq":232,"given_name":1055,"surname":1056,"affiliation":63,"orcid":63},{"paper_id":10883,"author_seq":218,"given_name":1780,"surname":2622,"affiliation":63,"orcid":63},{"paper_id":10883,"author_seq":203,"given_name":1471,"surname":9987,"affiliation":63,"orcid":63},{"paper_id":10883,"author_seq":188,"given_name":1058,"surname":1059,"affiliation":63,"orcid":63},"We introduce a large dataset of narrative texts and questions about these texts, intended to be used in a machine comprehension task that requires reasoning using commonsense knowledge. 
Our dataset complements similar datasets in that we focus on stories about everyday activities, such as going to the movies or working in the garden, and that the questions require commonsense knowledge, or more specifically, script knowledge, to be answered. We show that our mode of data collection via crowdsourcing results in a substantial amount of such inference questions. The dataset forms the basis of a shared task on commonsense and script knowledge organized at SemEval 2018 and provides challenging test cases for the broader natural language understanding community.",{"paper_id":10897,"title":10898,"year":81,"month":855,"day":63,"doi":10899,"resource_url":10900,"first_page":63,"last_page":63,"pdf_url":10901,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10902,"paper_type":860,"authors":10903,"abstract":10914},"lrec2018-main-565","A Neural Network Based Model for Loanword Identification in Uyghur","10.63317\u002F337ucdsbkcih","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-565","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F291.pdf","mi-etal-2018-neural",[10904,10907,10909,10910,10912],{"paper_id":10897,"author_seq":247,"given_name":10905,"surname":10906,"affiliation":63,"orcid":63},"Chenggang","Mi",{"paper_id":10897,"author_seq":232,"given_name":10908,"surname":3503,"affiliation":63,"orcid":63},"Yating",{"paper_id":10897,"author_seq":218,"given_name":1221,"surname":1588,"affiliation":63,"orcid":63},{"paper_id":10897,"author_seq":203,"given_name":10911,"surname":2438,"affiliation":63,"orcid":63},"Xi",{"paper_id":10897,"author_seq":188,"given_name":10913,"surname":6441,"affiliation":63,"orcid":63},"Tonghai","Lexical borrowing happens in almost all languages. To obtain more bilingual knowledge from monolingual corpora, we propose a neural network based loanword identification model for Uyghur. 
We build our model on a bidirectional LSTM - CNN framework, which can capture past and future information effectively and learn both word level and character level features from training data automatically. To overcome data sparsity that exists in model training, we also suggest three additional features , such as hybrid language model feature, pronunciation similarity feature and part-of-speech tagging feature to further improve the performance of our proposed approach. We conduct experiments on Chinese, Arabic and Russian loanword detection in Uyghur. Experimental results show that our proposed method outperforms several baseline models.",{"paper_id":10916,"title":10917,"year":81,"month":855,"day":63,"doi":10918,"resource_url":10919,"first_page":63,"last_page":63,"pdf_url":10920,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10921,"paper_type":860,"authors":10922,"abstract":10929},"lrec2018-main-566","Revisiting Distant Supervision for Relation Extraction","10.63317\u002F2frw8qrccomt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-566","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F414.pdf","jiang-etal-2018-revisiting",[10923,10925,10926,10928],{"paper_id":10916,"author_seq":247,"given_name":10924,"surname":6441,"affiliation":63,"orcid":63},"Tingsong",{"paper_id":10916,"author_seq":232,"given_name":8129,"surname":5232,"affiliation":63,"orcid":63},{"paper_id":10916,"author_seq":218,"given_name":10927,"surname":9639,"affiliation":63,"orcid":63},"Chin-Yew",{"paper_id":10916,"author_seq":203,"given_name":2431,"surname":2432,"affiliation":63,"orcid":63},"Distant supervision has been widely used in the task of relation extraction (RE). However, when we carefully examine the experimental settings of previous work, we find two issues: (i) The compared models were trained on different training datasets. (ii) The existing testing data contains noise and bias issues. 
These issues may affect the conclusions in previous work. In this paper, our primary aim is to re-examine the distant supervision-based approaches under the experimental settings without the above issues. We approach this by training models on the same dataset and creating a new testing dataset annotated by the workers on Amazon Mechanical Turk. We draw new conclusions based on the new testing dataset. The new testing data can be obtained from http:\u002F\u002Faka.ms\u002Frelationie.",{"paper_id":10931,"title":10932,"year":81,"month":855,"day":63,"doi":10933,"resource_url":10934,"first_page":63,"last_page":63,"pdf_url":10935,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10936,"paper_type":860,"authors":10937,"abstract":10945},"lrec2018-main-567","Incorporating Contextual Information for Language-Independent, Dynamic Disambiguation Tasks","10.63317\u002F5hojorwshoo2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-567","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F289.pdf","staron-etal-2018-incorporating",[10938,10940,10943],{"paper_id":10931,"author_seq":247,"given_name":8141,"surname":10939,"affiliation":63,"orcid":63},"Staron",{"paper_id":10931,"author_seq":232,"given_name":10941,"surname":10942,"affiliation":63,"orcid":63},"Özge","Alaçam",{"paper_id":10931,"author_seq":218,"given_name":1251,"surname":10944,"affiliation":63,"orcid":63},"Menzel","Humans resolve various kinds of linguistic ambiguities by exploiting available external evidence that has been acquired from modalities besides the linguistic one. This behavior can be observed for several languages, for English or German for example. In contrast, most natural language processing systems, parsers for example, rely on linguistic information only without taking further knowledge into account. While those systems are expected to correctly handle syntactically unambiguous cases, they cannot resolve syntactic ambiguities reliably. 
This paper hypothesizes that parsers would be able to find non-canonical interpretations of ambiguous sentences, if they exploited external, contextual information. The proposed multi-modal system, which combines data-driven and grammar-based approaches, confirmed this hypothesis in experiments on syntactically ambiguous sentences. This work focuses on the scarcely investigated relative clause attachment ambiguity instead of prepositional phrase attachment ambiguities, which are already well known in the literature. Experiments were conducted for English, German and Turkish and dynamic, i. e. highly dissimilar, contexts.",{"paper_id":10947,"title":10948,"year":81,"month":855,"day":63,"doi":10949,"resource_url":10950,"first_page":63,"last_page":63,"pdf_url":10951,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10952,"paper_type":860,"authors":10953,"abstract":10959},"lrec2018-main-568","Overcoming the Long Tail Problem: A Case Study on CO2-Footprint Estimation of Recipes using Information Retrieval","10.63317\u002F4swce3bd85zg","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-568","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F485.pdf","geiger-braschler-2018-overcoming",[10954,10957],{"paper_id":10947,"author_seq":247,"given_name":10955,"surname":10956,"affiliation":63,"orcid":63},"Melanie","Geiger",{"paper_id":10947,"author_seq":232,"given_name":1248,"surname":10958,"affiliation":63,"orcid":63},"Braschler","We propose approaches that use information retrieval methods for the automatic calculation of CO2-footprints of cooking recipes. A particular challenge is the \"long tail problem\" that arises with the large diversity of possible ingredients. The proposed approaches are generalizable to other use cases in which a numerical value for semi-structured items has to be calculated, for example, the calculation of the insurance value of a property based on a real estate listing. 
Our first approach, ingredient matching, calculates the CO2-footprint based on the ingredient descriptions that are matched to food products in a language resource and therefore suffers from the long tail problem. On the other hand, our second approach directly uses the recipe to estimate the CO2-value based on its closest neighbor using an adapted version of the BM25 weighting scheme. Furthermore, we combine these two approaches in order to achieve a more reliable estimate. Our experiments show that the automatically calculated CO2-value estimates lie within an acceptable range compared to the manually calculated values. Therefore, the costs of the calculation of the CO2-footprints can be reduced dramatically by using the automatic approaches. This helps to make the information available to a large audience in order to increase the awareness and transparency of the environmental impact of food consumption.",{"paper_id":10961,"title":10962,"year":81,"month":855,"day":63,"doi":10963,"resource_url":10964,"first_page":63,"last_page":63,"pdf_url":10965,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10966,"paper_type":860,"authors":10967,"abstract":10972},"lrec2018-main-569","Comparison of Pun Detection Methods Using Japanese Pun Corpus","10.63317\u002F53gyoi8m3vej","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-569","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F587.pdf","yatsu-araki-2018-comparison",[10968,10971],{"paper_id":10961,"author_seq":247,"given_name":10969,"surname":10970,"affiliation":63,"orcid":63},"Motoki","Yatsu",{"paper_id":10961,"author_seq":232,"given_name":2474,"surname":5457,"affiliation":63,"orcid":63},"A sampling survey of typology and component ratio analysis in Japanese puns revealed that the type of Japanese pun that had the largest proportion was a pun type with two sound sequences, whose consonants are phonetically close to each other in the same sentence which includes 
the pun. Based on this finding, we constructed rules to detect pairs of phonetically similar sequences as features for a supervised machine learning classifier. Using these features in addition to Bag-of-Words features, an evaluation experiment confirmed the effectiveness of adding the rule-based features to the baseline.",{"paper_id":10974,"title":10975,"year":81,"month":855,"day":63,"doi":10976,"resource_url":10977,"first_page":63,"last_page":63,"pdf_url":10978,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10979,"paper_type":860,"authors":10980,"abstract":10990},"lrec2018-main-570","A vision-grounded dataset for predicting typical locations for verbs","10.63317\u002F45ptofymouuj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-570","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1089.pdf","mukuze-etal-2018-vision",[10981,10984,10986,10987],{"paper_id":10974,"author_seq":247,"given_name":10982,"surname":10983,"affiliation":63,"orcid":63},"Nelson","Mukuze",{"paper_id":10974,"author_seq":232,"given_name":884,"surname":10985,"affiliation":63,"orcid":63},"Rohrbach",{"paper_id":10974,"author_seq":218,"given_name":9594,"surname":9595,"affiliation":63,"orcid":63},{"paper_id":10974,"author_seq":203,"given_name":10988,"surname":10989,"affiliation":63,"orcid":63},"Bernt","Schiele","Information about the location of an action is often implicit in text, as humans can infer it based on common sense knowledge. Today’s NLP systems however struggle with inferring information that goes beyond what is explicit in text. Selectional preference estimation based on large amounts of data provides a way to infer prototypical role fillers, but text-based systems tend to underestimate the probability of the most typical role fillers. We here present a new dataset containing thematic fit judgments for 2,000 verb\u002Flocation pairs. 
This dataset can be used for evaluating text-based, vision-based or multimodal inference systems for the typicality of an event’s location. We additionally provide three thematic fit baselines for this dataset: a state-of-the-art neural networks based thematic fit model learned from linguistic data, a model estimating typical locations based on the MSCOCO dataset and a simple combination of the systems.",{"paper_id":10992,"title":10993,"year":81,"month":855,"day":63,"doi":10994,"resource_url":10995,"first_page":63,"last_page":63,"pdf_url":10996,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":10997,"paper_type":860,"authors":10998,"abstract":11003},"lrec2018-main-571","Creating dialect sub-corpora by clustering: a case in Japanese for an adaptive method","10.63317\u002F388yq5ns8e45","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-571","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F142.pdf","sato-heffernan-2018-creating",[10999,11001],{"paper_id":10992,"author_seq":247,"given_name":2368,"surname":11000,"affiliation":63,"orcid":63},"Sato",{"paper_id":10992,"author_seq":232,"given_name":2184,"surname":11002,"affiliation":63,"orcid":63},"Heffernan","We propose a pipeline through which to derive clusters of dialects, given a mixed corpus composed of different dialects, when their standard counterpart is sufficiently resourced. The test case is Japanese, where the written standard language is sufficiently equipped with adequate resources. Our method starts by detecting non-standard contents first, and then clusters what is deemed dialectal. 
We report the results on the clustering of mixed Twitter corpus into four dialects (Kansai, Tohoku, Chugoku and Kyushu).",{"paper_id":11005,"title":11006,"year":81,"month":855,"day":63,"doi":11007,"resource_url":11008,"first_page":63,"last_page":63,"pdf_url":11009,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11010,"paper_type":860,"authors":11011,"abstract":11020},"lrec2018-main-572","A Fast and Flexible Webinterface for Dialect Research in the Low Countries","10.63317\u002F3jtsqr2rmpkq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-572","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F220.pdf","van-hout-etal-2018-fast",[11012,11015,11018,11019],{"paper_id":11005,"author_seq":247,"given_name":11013,"surname":11014,"affiliation":63,"orcid":63},"Roeland","van Hout",{"paper_id":11005,"author_seq":232,"given_name":11016,"surname":11017,"affiliation":63,"orcid":63},"Nicoline","van der Sijs",{"paper_id":11005,"author_seq":218,"given_name":4617,"surname":4618,"affiliation":63,"orcid":63},{"paper_id":11005,"author_seq":203,"given_name":4614,"surname":4615,"affiliation":63,"orcid":63},"This paper describes the development of webportals with search applications built in order to make the data from the 33 volumes of the Dictionary of the Brabantic dialects (1967-2005) and the 39 volumes of the Dictionary of the Limburgian dialects (1983-2008) accessible and retrievable for both the research community and the general audience. Part of the data was available in a digital format, a larger part only in print. The printed data was semi-automatically converted from paper to structured text (database). This process allowed for streamlining information, applying (semi-)automatic data checks and manually correcting the input. Next, the resulting database was the backbone of a webportal for faceted search requests on the full collection, including filtering and splitting the results on metadata. 
The design and implementation of the webportals, called e-WBD and e-WLD, are being defined in more detail. The URLs of the portals are: http:\u002F\u002Fe-wbd.nl\u002F and http:\u002F\u002Fwww.e-wld.nl\u002F.",{"paper_id":11022,"title":11023,"year":81,"month":855,"day":63,"doi":11024,"resource_url":11025,"first_page":63,"last_page":63,"pdf_url":11026,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11027,"paper_type":860,"authors":11028,"abstract":11036},"lrec2018-main-573","Arabic Dialect Identification in the Context of Bivalency and Code-Switching","10.63317\u002F56ct2dgra5b3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-573","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F237.pdf","el-haj-etal-2018-arabic",[11029,11032,11033],{"paper_id":11022,"author_seq":247,"given_name":11030,"surname":11031,"affiliation":63,"orcid":63},"Mahmoud","El-Haj",{"paper_id":11022,"author_seq":232,"given_name":3690,"surname":3833,"affiliation":63,"orcid":63},{"paper_id":11022,"author_seq":218,"given_name":11034,"surname":11035,"affiliation":63,"orcid":63},"Mariam","Aboelezz","In this paper we use a novel approach towards Arabic dialect identification using language bivalency and written code-switching. Bivalency between languages or dialects is where a word or element is treated by language users as having a fundamentally similar semantic content in more than one language or dialect. Arabic dialect identification in writing is a difficult task even for humans due to the fact that words are used interchangeably between dialects. The task of automatically identifying dialect is harder and classifiers trained using only n-grams will perform poorly when tested on unseen data. Such approaches require significant amounts of annotated training data which is costly and time consuming to produce. 
Currently available Arabic dialect datasets do not exceed a few hundred thousand sentences, thus we need to extract features other than word and character n-grams. In our work we present experimental results from automatically identifying dialects from the four main Arabic dialect regions (Egypt, North Africa, Gulf and Levant) in addition to Standard Arabic. We extend previous work by incorporating additional grammatical and stylistic features and define a subtractive bivalency profiling approach to address issues of bivalent words across the examined Arabic dialects. The results show that our new methods classification accuracy can reach more than 76% and score well (66%) when tested on completely unseen data.",{"paper_id":11038,"title":11039,"year":81,"month":855,"day":63,"doi":11040,"resource_url":11041,"first_page":63,"last_page":63,"pdf_url":11042,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11043,"paper_type":860,"authors":11044,"abstract":11072},"lrec2018-main-574","Unified Guidelines and Resources for Arabic Dialect 
Orthography","10.63317\u002F2k835qnyjdtw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-574","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F395.pdf","habash-etal-2018-unified",[11045,11046,11047,11048,11049,11050,11051,11054,11055,11056,11059,11061,11064,11067,11069,11070,11071],{"paper_id":11038,"author_seq":247,"given_name":3646,"surname":3647,"affiliation":63,"orcid":63},{"paper_id":11038,"author_seq":232,"given_name":10438,"surname":10439,"affiliation":63,"orcid":63},{"paper_id":11038,"author_seq":218,"given_name":8340,"surname":8341,"affiliation":63,"orcid":63},{"paper_id":11038,"author_seq":203,"given_name":1727,"surname":10431,"affiliation":63,"orcid":63},{"paper_id":11038,"author_seq":188,"given_name":10433,"surname":10434,"affiliation":63,"orcid":63},{"paper_id":11038,"author_seq":172,"given_name":2736,"surname":10441,"affiliation":63,"orcid":63},{"paper_id":11038,"author_seq":155,"given_name":11052,"surname":11053,"affiliation":63,"orcid":63},"Reem","Faraj",{"paper_id":11038,"author_seq":138,"given_name":3042,"surname":3043,"affiliation":63,"orcid":63},{"paper_id":11038,"author_seq":121,"given_name":8344,"surname":8345,"affiliation":63,"orcid":63},{"paper_id":11038,"author_seq":104,"given_name":11057,"surname":11058,"affiliation":63,"orcid":63},"Nasser","Zalmout",{"paper_id":11038,"author_seq":87,"given_name":2650,"surname":11060,"affiliation":63,"orcid":63},"Hassan",{"paper_id":11038,"author_seq":73,"given_name":11062,"surname":11063,"affiliation":63,"orcid":63},"Faisal","Al-Shargi",{"paper_id":11038,"author_seq":55,"given_name":11065,"surname":11066,"affiliation":63,"orcid":63},"Sakhar","Alkhereyf",{"paper_id":11038,"author_seq":38,"given_name":8649,"surname":11068,"affiliation":63,"orcid":63},"Abdulkareem",{"paper_id":11038,"author_seq":17,"given_name":4466,"surname":4467,"affiliation":63,"orcid":63},{"paper_id":11038,"author_seq":2594,"given_name":1437,"surname":10428,"affiliation":63,"orcid":63},{"paper_id":110
38,"author_seq":2597,"given_name":7485,"surname":7486,"affiliation":63,"orcid":63},"We present a unified set of guidelines and resources for   conventional orthography of dialectal Arabic.  While Standard Arabic   has well defined orthographic standards, none of the Arabic dialects   do today.  Previous efforts on conventionalizing the dialectal   orthography have focused on specific dialects and made often ad hoc   decisions. In this work, we present a common set of guidelines and   meta-guidelines and apply them to 28 Arab city dialects from Rabat   to Muscat.  These guidelines and their connected resources are being   used by three large Arabic dialect processing projects in three   universities.",{"paper_id":11074,"title":11075,"year":81,"month":855,"day":63,"doi":11076,"resource_url":11077,"first_page":63,"last_page":63,"pdf_url":11078,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11079,"paper_type":860,"authors":11080,"abstract":11090},"lrec2018-main-575","Automatic Identification of Maghreb Dialects Using a Dictionary-Based Approach","10.63317\u002F3a4wixcre2vd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-575","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F580.pdf","saadane-etal-2018-automatic",[11081,11083,11086,11088,11089],{"paper_id":11074,"author_seq":247,"given_name":8344,"surname":11082,"affiliation":63,"orcid":63},"Saâdane",{"paper_id":11074,"author_seq":232,"given_name":11084,"surname":11085,"affiliation":63,"orcid":63},"Hosni","Seffih",{"paper_id":11074,"author_seq":218,"given_name":904,"surname":11087,"affiliation":63,"orcid":63},"Fluhr",{"paper_id":11074,"author_seq":203,"given_name":1320,"surname":1321,"affiliation":63,"orcid":63},{"paper_id":11074,"author_seq":188,"given_name":1825,"surname":1826,"affiliation":63,"orcid":63},"Automatic identification of Arabic dialects in a text is a difficult task, especially for Maghreb languages and when they are written in Arabic or 
Latin characters (Arabizi). These texts are characterized by the use of code-switching between the Modern Standard Arabic (MSA) and the Arabic Dialect (AD) in the texts written in Arabic, or between Arabizi and foreign languages for those written in Latin.  This paper presents the specific resources and tools we have developed for this purpose, with a focus on the transliteration of Arabizi into Arabic (using the dedicated tools for Arabic dialects). A dictionary-based approach to detect the dialectal origin of a text is described, it exhibits satisfactory results.",{"paper_id":11092,"title":11093,"year":81,"month":855,"day":63,"doi":11094,"resource_url":11095,"first_page":63,"last_page":63,"pdf_url":11096,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11097,"paper_type":860,"authors":11098,"abstract":11110},"lrec2018-main-576","Shami: A Corpus of Levantine Arabic Dialects","10.63317\u002F4i96mxb3ymx8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-576","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F820.pdf","abu-kwaik-etal-2018-shami",[11099,11102,11105,11108],{"paper_id":11092,"author_seq":247,"given_name":11100,"surname":11101,"affiliation":63,"orcid":63},"Kathrein","Abu Kwaik",{"paper_id":11092,"author_seq":232,"given_name":11103,"surname":11104,"affiliation":63,"orcid":63},"Motaz","Saad",{"paper_id":11092,"author_seq":218,"given_name":11106,"surname":11107,"affiliation":63,"orcid":63},"Stergios","Chatzikyriakidis",{"paper_id":11092,"author_seq":203,"given_name":1269,"surname":11109,"affiliation":63,"orcid":63},"Dobnik","Modern Standard Arabic (MSA) is the official language used in education and media across the Arab world both in writing and formal speech. However, in daily communication several dialects depending on the country, region as well as other social factors, are used. 
With the emergence of social media, the dialectal amount of data on the Internet have increased and the NLP tools that support MSA are not well-suited to process this data due to the difference between the dialects and MSA. In this paper, we construct the Shami corpus, the first Levantine Dialect Corpus (SDC) covering data from the four dialects spoken in Palestine, Jordan, Lebanon and Syria. We also describe rules for pre-processing without affecting the meaning so that it is processable by NLP tools. We choose Dialect Identification as the task to evaluate SDC and compare it with two other corpora. In this respect, experiments are conducted using different parameters based on n-gram models and Naive Bayes classifiers. SDC is larger than the existing corpora in terms of size, words and vocabularies. In addition, we use the performance on the Language Identification task to exemplify the similarities and differences in the individual dialects",{"paper_id":11112,"title":11113,"year":81,"month":855,"day":63,"doi":11114,"resource_url":11115,"first_page":63,"last_page":63,"pdf_url":11116,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11117,"paper_type":860,"authors":11118,"abstract":11126},"lrec2018-main-577","You Tweet What You Speak: A City-Level Dataset of Arabic Dialects","10.63317\u002F3wos9hprfv6b","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-577","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F929.pdf","abdul-mageed-etal-2018-tweet",[11119,11122,11124],{"paper_id":11112,"author_seq":247,"given_name":11120,"surname":11121,"affiliation":63,"orcid":63},"Muhammad","Abdul-Mageed",{"paper_id":11112,"author_seq":232,"given_name":11060,"surname":11123,"affiliation":63,"orcid":63},"Alhuzali",{"paper_id":11112,"author_seq":218,"given_name":1157,"surname":11125,"affiliation":63,"orcid":63},"Elaraby","Arabic has a wide range of varieties or dialects. 
Although a number of pioneering works have targeted some Arabic dialects, other dialects remain largely without investigation. A serious bottleneck for studying these dialects is lack of any data that can be exploited in computational models. In this work, we aim to bridge this gap: We present a considerably large dataset of > 1/4 billion tweets representing a wide range of dialects. Our dataset is more nuanced than previously reported work in that it is labeled at the fine-grained level of city. More specifically, the data represent 29 major Arab cities from 10 Arab countries with varying dialects (e.g., Egyptian, Gulf, KSA, Levantine, Yemeni).",{"paper_id":11128,"title":11129,"year":81,"month":855,"day":63,"doi":11130,"resource_url":11131,"first_page":63,"last_page":63,"pdf_url":11132,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11133,"paper_type":860,"authors":11134,"abstract":11136},"lrec2018-main-578","Visualizing the “Dictionary of Regionalisms of France” (DRF)","10.63317\u002F3wfuzkec5d5g","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-578","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1005.pdf","wan-2018-visualizing",[11135],{"paper_id":11128,"author_seq":247,"given_name":2787,"surname":2788,"affiliation":63,"orcid":63},"This paper presents CorpusDRF, an open-source, digitized collection of regionalisms, their parts of speech and recognition rates, published in 'Dictionnaire des Regionalismes de France' (DRF, \"Dictionary of Regionalisms of France\") (Rezeau, 2001), enabling the visualization and analyses of the largest-scale study of French regionalisms in the 20th century using publicly available data. CorpusDRF was curated and checked manually against the entirety of the printed volume of more than 1000 pages. It contains all the entries in the DRF for which recognition rates in continental France were recorded from the surveys carried out from 1994 to 1996 and from 1999 to 2000. 
In this paper, in addition to introducing the corpus, we also offer some exploratory visualizations using an easy-to-use, freely available web application and compare the patterns in our analysis with that by (Goebl, 2005a) and (Goebl, 2007).",{"paper_id":11138,"title":11139,"year":81,"month":855,"day":63,"doi":11140,"resource_url":11141,"first_page":63,"last_page":63,"pdf_url":11142,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11143,"paper_type":860,"authors":11144,"abstract":11155},"lrec2018-main-579","DART: A Large Dataset of Dialectal Arabic Tweets","10.63317\u002F4b9q46p2vs9o","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-579","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1013.pdf","alsarsour-etal-2018-dart",[11145,11148,11150,11152],{"paper_id":11138,"author_seq":247,"given_name":11146,"surname":11147,"affiliation":63,"orcid":63},"Israa","Alsarsour",{"paper_id":11138,"author_seq":232,"given_name":11149,"surname":1157,"affiliation":63,"orcid":63},"Esraa",{"paper_id":11138,"author_seq":218,"given_name":11052,"surname":11151,"affiliation":63,"orcid":63},"Suwaileh",{"paper_id":11138,"author_seq":203,"given_name":11153,"surname":11154,"affiliation":63,"orcid":63},"Tamer","Elsayed","In this paper, we present a new large manually-annotated multi-dialect dataset of Arabic tweets that is publicly available. The Dialectal ARabic Tweets (DART) dataset has about 25K tweets that are annotated via crowdsourcing and it is well-balanced over five main groups of Arabic dialects: Egyptian, Maghrebi, Levantine, Gulf, and Iraqi. The paper outlines the pipeline of constructing the dataset from crawling tweets that match a list of dialect phrases to annotating the tweets by the crowd. We also touch some challenges that we face during the process. We evaluate the quality of the dataset from two perspectives: the inter-annotator agreement and the accuracy of the final labels. 
Results show that both measures were substantially high for the Egyptian, Gulf, and Levantine dialect groups, but lower for the Iraqi and Maghrebi dialects, which indicates the difficulty of identifying those two dialects manually and hence automatically.",{"paper_id":11157,"title":11158,"year":81,"month":855,"day":63,"doi":11159,"resource_url":11160,"first_page":63,"last_page":63,"pdf_url":11161,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11162,"paper_type":860,"authors":11163,"abstract":11167},"lrec2018-main-580","Classification of Closely Related Sub-dialects of Arabic Using Support-Vector Machines","10.63317\u002F5knmymeqsxge","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-580","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1104.pdf","wray-2018-classification",[11164],{"paper_id":11157,"author_seq":247,"given_name":11165,"surname":11166,"affiliation":63,"orcid":63},"Samantha","Wray","Colloquial dialects of Arabic can be roughly categorized into five groups based on relatedness and geographic location (Egyptian, North African\u002FMaghrebi, Gulf, Iraqi, and Levantine), but given that all dialects utilize much of the same writing system and share overlapping features and vocabulary, dialect identification and text classification is no trivial task. Furthermore, text classification by dialect is often performed at a coarse-grained level into these five groups or a subset thereof, and there is little work on sub-dialectal classification. The current study utilizes an n-gram based SVM to classify on a fine-grained sub-dialectal level, and compares it to methods used in dialect classification such as vocabulary pruning of shared items across dialects. A test case of the dialect Levantine is presented here, and results of 65% accuracy on a four-way classification experiment to sub-dialects of Levantine (Jordanian, Lebanese, Palestinian and Syrian) are presented and discussed. 
This paper also examines the possibility of leveraging existing mixed-dialectal resources to determine their sub-dialectal makeup by automatic classification.",{"paper_id":11169,"title":11170,"year":81,"month":855,"day":63,"doi":11171,"resource_url":11172,"first_page":63,"last_page":63,"pdf_url":11173,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11174,"paper_type":860,"authors":11175,"abstract":11178},"lrec2018-main-581","Page Stream Segmentation with Convolutional Neural Nets Combining Textual and Visual Features","10.63317\u002F3syaqgcgs7bs","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-581","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F628.pdf","wiedemann-heyer-2018-page",[11176,11177],{"paper_id":11169,"author_seq":247,"given_name":4739,"surname":4740,"affiliation":63,"orcid":63},{"paper_id":11169,"author_seq":232,"given_name":4745,"surname":4746,"affiliation":63,"orcid":63},"In recent years, (retro-)digitizing paper-based files became a major undertaking for private and public archives as well as an important task in electronic mailroom applications. As a first step, the workflow involves scanning and Optical Character Recognition (OCR) of documents. Preservation of document contexts of single page scans is a major requirement in this context. To facilitate workflows involving very large amounts of paper scans, page stream segmentation (PSS) is the task to automatically separate a stream of scanned images into multi-page documents. In a digitization project together with a German federal archive, we developed a novel approach based on convolutional neural networks (CNN) combining image and text features to achieve optimal document separation results. 
Evaluation shows that our PSS architecture achieves an accuracy up to 93 % which can be regarded as a new state-of-the-art for this task.",{"paper_id":11180,"title":11181,"year":81,"month":855,"day":63,"doi":11182,"resource_url":11183,"first_page":63,"last_page":63,"pdf_url":11184,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11185,"paper_type":860,"authors":11186,"abstract":11194},"lrec2018-main-582","Automating Document Discovery in the Systematic Review Process: How to Use Chaff to Extract Wheat","10.63317\u002F5429ipy5fzad","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-582","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F777.pdf","norman-etal-2018-automating",[11187,11189,11192,11193],{"paper_id":11180,"author_seq":247,"given_name":1359,"surname":11188,"affiliation":63,"orcid":63},"Norman",{"paper_id":11180,"author_seq":232,"given_name":11190,"surname":11191,"affiliation":63,"orcid":63},"Mariska","Leeflang",{"paper_id":11180,"author_seq":218,"given_name":1388,"surname":1389,"affiliation":63,"orcid":63},{"paper_id":11180,"author_seq":203,"given_name":1403,"surname":1404,"affiliation":63,"orcid":63},"Systematic reviews in e.g. empirical medicine address research questions by comprehensively examining the entire published literature. Conventionally, manual literature surveys decide inclusion in two steps, first based on abstracts and title, then by full text, yet current methods to automate the process make no distinction between gold data from these two stages. In this work we compare the impact different schemes for choosing positive and negative examples from the different screening stages have on the training of automated systems. We train a ranker using logistic regression and evaluate it on a new gold standard dataset for clinical NLP, and on an existing gold standard dataset for drug class efficacy. 
The classification and ranking achieves an average AUC of 0.803 and 0.768 when relying on gold standard decisions based on title and abstracts of articles, and an AUC of 0.625 and 0.839 when relying on gold standard decisions based on full text. Our results suggest that it makes little difference which screening stage the gold standard decisions are drawn from, and that the decisions need not be based on the full text. The results further suggest that common-off-the-shelf algorithms can reduce the amount of work required to retrieve relevant literature.",{"paper_id":11196,"title":11197,"year":81,"month":855,"day":63,"doi":11198,"resource_url":11199,"first_page":63,"last_page":63,"pdf_url":11200,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11201,"paper_type":860,"authors":11202,"abstract":11208},"lrec2018-main-583","Two Multilingual Corpora Extracted from the Tenders Electronic Daily for Machine Learning and Machine Translation Applications.","10.63317\u002F53r8rnqbanag","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-583","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F832.pdf","ahmia-etal-2018-two",[11203,11206,11207],{"paper_id":11196,"author_seq":247,"given_name":11204,"surname":11205,"affiliation":63,"orcid":63},"Oussama","Ahmia",{"paper_id":11196,"author_seq":232,"given_name":1641,"surname":1137,"affiliation":63,"orcid":63},{"paper_id":11196,"author_seq":218,"given_name":7061,"surname":5884,"affiliation":63,"orcid":63},"The European \"Tenders Electronic Daily\" (TED) is a large source of semi-structured and multilingual data that is very valuable to the Natural Language Processing community. This data sets can effectively be used to address complex machine translation, multilingual terminology extraction, text-mining, or to benchmark information retrieval systems. 
Despite of the services offered by the user-friendliness of the web site that is made available to the public to access the publishing of the EU call for tenders, collecting and managing such kind of data is a great burden and consumes a lot of time and computing resources. This could explain why such a resource is not very (if any) exploited today by computer scientists or engineers in NLP. The aim of this paper is to describe two documented and easy-to-use multilingual corpora (one of them is a parallel corpus), extracted from the TED web source that we will release for the benefit of the NLP community.",{"paper_id":11210,"title":11211,"year":81,"month":855,"day":63,"doi":11212,"resource_url":11213,"first_page":63,"last_page":63,"pdf_url":11214,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11215,"paper_type":860,"authors":11216,"abstract":11224},"lrec2018-main-584","Using Adversarial Examples in Natural Language Processing","10.63317\u002F54y9ty8zmf56","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-584","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F852.pdf","belohlavek-etal-2018-using",[11217,11219,11222,11223],{"paper_id":11210,"author_seq":247,"given_name":6915,"surname":11218,"affiliation":63,"orcid":63},"Bělohlávek",{"paper_id":11210,"author_seq":232,"given_name":11220,"surname":11221,"affiliation":63,"orcid":63},"Ondřej","Plátek",{"paper_id":11210,"author_seq":218,"given_name":6226,"surname":6227,"affiliation":63,"orcid":63},{"paper_id":11210,"author_seq":203,"given_name":5413,"surname":5414,"affiliation":63,"orcid":63},"Machine learning models have been providing promising results in many fields including natural language processing. These models are, nevertheless, prone to adversarial examples. These are artificially constructed examples which evince two main features: they resemble the real training data but they deceive already trained model. 
This paper investigates the effect of using adversarial examples during the training of recurrent neural networks whose text input is in the form of a sequence of word\u002Fcharacter embeddings. The effects are studied on a compilation of eight NLP datasets whose interface was unified for quick experimenting. Based on the experiments and the dataset characteristics, we conclude that using the adversarial examples for NLP tasks that are modeled by recurrent neural networks provides a regularization effect and enables the training of models with greater number of parameters without overfitting. In addition, we discuss which combinations of datasets and model settings might benefit from the adversarial training the most.",{"paper_id":11226,"title":11227,"year":81,"month":855,"day":63,"doi":11228,"resource_url":11229,"first_page":63,"last_page":63,"pdf_url":11230,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11231,"paper_type":860,"authors":11232,"abstract":11238},"lrec2018-main-585","Modeling Trolling in Social Media Conversations","10.63317\u002F3rqa4q7o6hoy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-585","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F909.pdf","mojica-de-la-vega-ng-2018-modeling",[11233,11236],{"paper_id":11226,"author_seq":247,"given_name":11234,"surname":11235,"affiliation":63,"orcid":63},"Luis Gerardo","Mojica de la Vega",{"paper_id":11226,"author_seq":232,"given_name":11237,"surname":908,"affiliation":63,"orcid":63},"Vincent","Social media websites, electronic newspapers and Internet forums allow visitors to leave comments for others to read and interact. This exchange is not free from participants with malicious intentions, who troll others by positing messages that are intended to be provocative, offensive, or menacing. 
With the goal of facilitating the computational modeling of trolling, we propose a trolling categorization that is novel in that it allows comment-based analysis from both the trolls' and the responders' perspectives, characterizing these two perspectives using four aspects, namely, the troll's intention and his intention disclosure, as well as the responder's interpretation of the troll's intention and her response strategy. Using this categorization, we annotate and release a dataset containing excerpts of Reddit conversations involving suspected trolls and their interactions with other users.",{"paper_id":11240,"title":11241,"year":81,"month":855,"day":63,"doi":11242,"resource_url":11243,"first_page":63,"last_page":63,"pdf_url":11244,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11245,"paper_type":860,"authors":11246,"abstract":11253},"lrec2018-main-586","Automatic Annotation of Semantic Term Types in the Complete ACL Anthology Reference Corpus","10.63317\u002F5g2jysot2ivo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-586","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F154.pdf","schumann-martinez-alonso-2018-automatic",[11247,11250],{"paper_id":11240,"author_seq":247,"given_name":11248,"surname":11249,"affiliation":63,"orcid":63},"Anne-Kathrin","Schumann",{"paper_id":11240,"author_seq":232,"given_name":11251,"surname":11252,"affiliation":63,"orcid":63},"Héctor","Martínez Alonso","In the present paper, we present an automated tagging approach aimed at enhancing a well-known resource, the ACL Anthology Reference Corpus, with semantic class labels for more than 20,000 technical terms that are relevant to the domain of computational linguistics. We use state-of-the-art classification techniques to assign semantic class labels to technical terms extracted from several reference term lists. 
We also sketch a set of research questions and approaches directed towards the integrated analysis of scientific corpora. To this end, we query the data set resulting from our annotation effort on both the term and the semantic class level.",{"paper_id":11255,"title":11256,"year":81,"month":855,"day":63,"doi":11257,"resource_url":11258,"first_page":63,"last_page":63,"pdf_url":11259,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11260,"paper_type":860,"authors":11261,"abstract":11267},"lrec2018-main-587","Annotated Corpus of Scientific Conference’s Homepages for Information Extraction","10.63317\u002F59kzdmrqnwa3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-587","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F828.pdf","andruszkiewicz-hazan-2018-annotated",[11262,11264],{"paper_id":11255,"author_seq":247,"given_name":996,"surname":11263,"affiliation":63,"orcid":63},"Andruszkiewicz",{"paper_id":11255,"author_seq":232,"given_name":11265,"surname":11266,"affiliation":63,"orcid":63},"Rafał","Hazan","In this paper, we present a new corpus that contains 943 homepages of scientific conferences, 14794 including subpages, with annotations of interesting information: name of a conference, its abbreviation, place, and several important dates; that is, submission, notification, and camera ready dates. The topics of conferences included in the corpus are equally distributed over five areas: artificial intelligence, natural language processing, computer science, telecommunication, and image processing. The corpus is publicly available. 
Beside the characteristics of the corpus, we present the results of information extraction from the corpus using SVM and CRF models as we would like this corpus to be considered a reference data set for this type of task.",{"paper_id":11269,"title":11270,"year":81,"month":855,"day":63,"doi":11271,"resource_url":11272,"first_page":63,"last_page":63,"pdf_url":11273,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11274,"paper_type":860,"authors":11275,"abstract":11278},"lrec2018-main-588","Improving Unsupervised Keyphrase Extraction using Background Knowledge","10.63317\u002F2cuhndqpj2ht","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-588","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F871.pdf","yu-ng-2018-improving",[11276,11277],{"paper_id":11269,"author_seq":247,"given_name":3503,"surname":2406,"affiliation":63,"orcid":63},{"paper_id":11269,"author_seq":232,"given_name":11237,"surname":908,"affiliation":63,"orcid":63},"Keyphrase is an efficient representation of the main idea of documents. While background knowledge can provide valuable information about documents, they are rarely incorporated in keyphrase extraction methods. In this paper, we propose WikiRank, an unsupervised method for keyphrase extraction based on the background knowledge from Wikipedia. Firstly, we construct a semantic graph for the document. Then we transform the keyphrase extraction problem into an optimization problem on the graph. Finally, we get the optimal keyphrase set to be the output. 
Our method obtains improvements over other state-of-art models by more than 2% in F1-score.",{"paper_id":11280,"title":11281,"year":81,"month":855,"day":63,"doi":11282,"resource_url":11283,"first_page":63,"last_page":63,"pdf_url":11284,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11285,"paper_type":860,"authors":11286,"abstract":11294},"lrec2018-main-589","WikiDragon: A Java Framework For Diachronic Content And Network Analysis Of MediaWikis","10.63317\u002F22dusetayuow","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-589","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F905.pdf","gleim-etal-2018-wikidragon",[11287,11290,11291],{"paper_id":11280,"author_seq":247,"given_name":11288,"surname":11289,"affiliation":63,"orcid":63},"Rüdiger","Gleim",{"paper_id":11280,"author_seq":232,"given_name":2736,"surname":4012,"affiliation":63,"orcid":63},{"paper_id":11280,"author_seq":218,"given_name":11292,"surname":11293,"affiliation":63,"orcid":63},"Sung","Y. Song","We introduce WikiDragon, a Java Framework designed to give developers in computational linguistics an intuitive API to build, parse and analyze instances of MediaWikis such as Wikipedia, Wiktionary or WikiSource on their computers. It covers current versions of pages as well as the complete revision history, gives diachronic access to both page source code as well as accurately parsed HTML and supports the diachronic exploration of the page network. WikiDragon is self enclosed and only requires an XML dump of the official Wikimedia Foundation website for import into an embedded database. No additional setup is required. 
We describe WikiDragon’s architecture and evaluate the framework based on the simple English Wikipedia with respect to the accuracy of link extraction, diachronic network analysis and the impact of using different Wikipedia frameworks to text analysis.",{"paper_id":11296,"title":11297,"year":81,"month":855,"day":63,"doi":11298,"resource_url":11299,"first_page":63,"last_page":63,"pdf_url":11300,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11301,"paper_type":860,"authors":11302,"abstract":11316},"lrec2018-main-590","Studying Muslim Stereotyping through Microportrait Extraction","10.63317\u002F5nsfe93ciwmh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-590","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F989.pdf","fokkens-etal-2018-studying",[11303,11304,11307,11310,11313],{"paper_id":11296,"author_seq":247,"given_name":9531,"surname":9532,"affiliation":63,"orcid":63},{"paper_id":11296,"author_seq":232,"given_name":11305,"surname":11306,"affiliation":63,"orcid":63},"Nel","Ruigrok",{"paper_id":11296,"author_seq":218,"given_name":11308,"surname":11309,"affiliation":63,"orcid":63},"Camiel","Beukeboom",{"paper_id":11296,"author_seq":203,"given_name":11311,"surname":11312,"affiliation":63,"orcid":63},"Gagestein","Sarah",{"paper_id":11296,"author_seq":188,"given_name":11314,"surname":11315,"affiliation":63,"orcid":63},"Wouter","van Atteveldt","Research from communication science has shown that stereotypical ideas are often reflected in language use. Media coverage of different groups in society influences the perception people have about these groups and even increases distrust and polarization among different groups. Investigating the forms of (especially subtle) stereotyping can raise awareness to journalists and help prevent reinforcing oppositions between groups in society. Conducting large-scale, deep investigations to determine whether we are faced with stereotyping is time-consuming and costly. 
We propose to tackle these challenges through the means of microportraits: an impression of a target group or individual conveyed in a single text. We introduce the first system implementation for Dutch and show that microportraits allow social scientists to explore various dimensions of stereotyping. We explore the possibilities provided by microportraits by investigating stereotyping of Muslims in the Dutch media. Our (preliminary) results show that microportraits provide more detailed insights into stereotyping compared to more basic models such as word clouds.",{"paper_id":11318,"title":11319,"year":81,"month":855,"day":63,"doi":11320,"resource_url":11321,"first_page":63,"last_page":63,"pdf_url":11322,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11323,"paper_type":860,"authors":11324,"abstract":11337},"lrec2018-main-591","Analyzing the Quality of Counseling Conversations: the Tell-Tale Signs of High-quality Counseling","10.63317\u002F5m64ebk6fy4k","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-591","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1033.pdf","perez-rosas-etal-2018-analyzing",[11325,11328,11330,11332,11333,11336],{"paper_id":11318,"author_seq":247,"given_name":11326,"surname":11327,"affiliation":63,"orcid":63},"Verónica","Pérez-Rosas",{"paper_id":11318,"author_seq":232,"given_name":11329,"surname":3774,"affiliation":63,"orcid":63},"Xuetong",{"paper_id":11318,"author_seq":218,"given_name":11331,"surname":1591,"affiliation":63,"orcid":63},"Christy",{"paper_id":11318,"author_seq":203,"given_name":9624,"surname":1588,"affiliation":63,"orcid":63},{"paper_id":11318,"author_seq":188,"given_name":11334,"surname":11335,"affiliation":63,"orcid":63},"Kenneth","Resnicow",{"paper_id":11318,"author_seq":172,"given_name":9664,"surname":9665,"affiliation":63,"orcid":63},"Behavioral and mental health are pressing issues worldwide. 
Counseling is emerging as a core treatment for a variety of mental and behavioral health disorders. Seeking to improve the understanding of counseling practice, researchers have started to explore Natural Language Processing approaches to analyze the nature of counseling interactions by studying aspects such as mirroring, empathy, and reflective listening. A challenging aspect of this task is the lack of psychotherapy corpora. In this paper, we introduce a new dataset of high-quality and low-quality counseling conversations collected from public web sources. We present a detailed description of the dataset collection process, including preprocessing, transcription, and the annotation of two counseling micro-skills: reflective listening and questions. We show that the obtained dataset can be used to build text-based classifiers able to predict the overall quality of a counseling conversation and provide insights into the linguistic differences between low and high quality counseling.",{"paper_id":11339,"title":11340,"year":81,"month":855,"day":63,"doi":11341,"resource_url":11342,"first_page":63,"last_page":63,"pdf_url":11343,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11344,"paper_type":860,"authors":11345,"abstract":11352},"lrec2018-main-592","Interpersonal Relationship Labels for the CALLHOME Corpus","10.63317\u002F5novjubsr7wy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-592","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1081.pdf","katerenchuk-etal-2018-interpersonal",[11346,11349,11350],{"paper_id":11339,"author_seq":247,"given_name":11347,"surname":11348,"affiliation":63,"orcid":63},"Denys","Katerenchuk",{"paper_id":11339,"author_seq":232,"given_name":3431,"surname":3432,"affiliation":63,"orcid":63},{"paper_id":11339,"author_seq":218,"given_name":1731,"surname":11351,"affiliation":63,"orcid":63},"Rosenberg","The way we speak to our friends, colleagues, or partners is different in both 
the explicit context, what we say, and the implicit, how we say it. Understanding these differences is important because it provides additional information that can be used in natural language processing tasks. For example, knowing the relationship between interlocutors can help to narrow the range of topics and improve automatic speech recognition system results. Unfortunately, the lack of corpora makes exploration of this problem intractable. In this work, we release a set of interpersonal relationship labels between conversation participants for the CALLHOME English corpus. We make the labels freely available for download on our website and hope that this effort can further boost research in this area.",{"paper_id":11354,"title":11355,"year":81,"month":855,"day":63,"doi":11356,"resource_url":11357,"first_page":63,"last_page":63,"pdf_url":11358,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11359,"paper_type":860,"authors":11360,"abstract":11370},"lrec2018-main-593","Text Mining for History: first steps on building a large dataset","10.63317\u002F4whro8s4wh9j","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-593","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1084.pdf","higuchi-etal-2018-text",[11361,11364,11366,11368],{"paper_id":11354,"author_seq":247,"given_name":11362,"surname":11363,"affiliation":63,"orcid":63},"Suemi","Higuchi",{"paper_id":11354,"author_seq":232,"given_name":11365,"surname":4789,"affiliation":63,"orcid":63},"Cláudia",{"paper_id":11354,"author_seq":218,"given_name":2014,"surname":11367,"affiliation":63,"orcid":63},"Cuconato",{"paper_id":11354,"author_seq":203,"given_name":9012,"surname":11369,"affiliation":63,"orcid":63},"Rademaker","This paper presents the initial efforts towards the creation of a new corpus on the history domain. 
Motivated by the historians' need to interrogate a vast material - almost 12 million words and more than three hundred thousand sentences - in a non-linear way, our approach privileges deep linguistic analysis on an encyclopedic-style data. In this context, the work presented here focuses on the preparation of the corpus, which is prior to the mining activity: the morphosyntactic annotation and the definition of semantic types for entities and relations relevant to the History domain. Taking advantage of the semantic nature of appositive constructions, we manually analyzed a sample of eleven hundred sentences in order to verify its potential as additional semantic clues to be considered. The results show that we are on the right track.",{"paper_id":11372,"title":11373,"year":81,"month":855,"day":63,"doi":11374,"resource_url":11375,"first_page":63,"last_page":63,"pdf_url":11376,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11377,"paper_type":860,"authors":11378,"abstract":11389},"lrec2018-main-594","Building Evaluation Datasets for Cultural Microblog Retrieval","10.63317\u002F3vs33o5kgq4e","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-594","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1115.pdf","goeuriot-etal-2018-building",[11379,11382,11385,11387],{"paper_id":11372,"author_seq":247,"given_name":11380,"surname":11381,"affiliation":63,"orcid":63},"Lorraine","Goeuriot",{"paper_id":11372,"author_seq":232,"given_name":11383,"surname":11384,"affiliation":63,"orcid":63},"Josiane","Mothe",{"paper_id":11372,"author_seq":218,"given_name":2089,"surname":11386,"affiliation":63,"orcid":63},"Mulhem",{"paper_id":11372,"author_seq":203,"given_name":5665,"surname":11388,"affiliation":63,"orcid":63},"SanJuan","ECLEF Microblog Cultural Contextualization is an evaluation challenge aiming at providing the research community with datasets to gather, organize and deliver relevant social data related to events 
generating large number of microblogs and web documents. The evaluation challenges runs every year since 2016. We describe in this paper the resources built for the challenge, that can be used outside of the context of the challenge.",{"paper_id":11391,"title":11392,"year":81,"month":855,"day":63,"doi":11393,"resource_url":11394,"first_page":63,"last_page":63,"pdf_url":11395,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11396,"paper_type":860,"authors":11397,"abstract":11404},"lrec2018-main-595","Training and Adapting Multilingual NMT for Less-resourced and Morphologically Rich Languages","10.63317\u002F2xb64bmrbc6y","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-595","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F75.pdf","rikters-etal-2018-training",[11398,11401,11402],{"paper_id":11391,"author_seq":247,"given_name":11399,"surname":11400,"affiliation":63,"orcid":63},"Matīss","Rikters",{"paper_id":11391,"author_seq":232,"given_name":4839,"surname":4840,"affiliation":63,"orcid":63},{"paper_id":11391,"author_seq":218,"given_name":4843,"surname":11403,"affiliation":63,"orcid":63},"Krišlauks","In this paper, we present results of employing multilingual and multi-way neural machine translation approaches for morphologically rich languages, such as Estonian and Russian. We experiment with different NMT architectures that allow achieving state-of-the-art translation quality and compare the multi-way model performance to one-way model performance. We report improvements of up to +3.27 BLEU points over our baseline results, when using a multi-way model trained using the transformer network architecture. 
We also provide open-source scripts used for shuffling and combining multiple parallel datasets for training of the multilingual systems.",{"paper_id":11406,"title":11407,"year":81,"month":855,"day":63,"doi":11408,"resource_url":11409,"first_page":63,"last_page":63,"pdf_url":11410,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11411,"paper_type":860,"authors":11412,"abstract":11419},"lrec2018-main-596","Cross-lingual Terminology Extraction for Translation Quality Estimation","10.63317\u002F59j8rm62som5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-596","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F245.pdf","yuan-etal-2018-cross",[11413,11414,11416,11418],{"paper_id":11406,"author_seq":247,"given_name":2406,"surname":6602,"affiliation":63,"orcid":63},{"paper_id":11406,"author_seq":232,"given_name":11415,"surname":9966,"affiliation":63,"orcid":63},"Yuze",{"paper_id":11406,"author_seq":218,"given_name":11417,"surname":2381,"affiliation":63,"orcid":63},"Yue",{"paper_id":11406,"author_seq":203,"given_name":3443,"surname":3444,"affiliation":63,"orcid":63},"We explore ways of identifying terms from monolingual texts and integrate them into investigating the contribution of terminology to translation quality.The researchers proposed a supervised learning method using common statistical measures for termhood and unithood as features to train classifiers for identifying terms in cross-domain and cross-language settings. On its basis, sequences of words from source texts (STs) and target texts (TTs) are aligned naively through a fuzzy matching mechanism for identifying the correctly translated term equivalents in student translations. 
Correlation analyses further show that normalized term occurrences in translations have weak linear relationship with translation quality in term of usefulness\u002Ftransfer, terminology\u002Fstyle, idiomatic writing and target mechanics and near- and above-strong relationship with the overall translation quality. This method has demonstrated some reliability in automatically identifying terms in human translations. However, drawbacks in handling low frequency terms and term variations shall be dealt in the future.",{"paper_id":11421,"title":11422,"year":81,"month":855,"day":63,"doi":11423,"resource_url":11424,"first_page":63,"last_page":63,"pdf_url":11425,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11426,"paper_type":860,"authors":11427,"abstract":11438},"lrec2018-main-597","Machine Translation of Low-Resource Spoken Dialects: Strategies for Normalizing Swiss German","10.63317\u002F44edz9peioav","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-597","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F302.pdf","honnet-etal-2018-machine",[11428,11431,11433,11436],{"paper_id":11421,"author_seq":247,"given_name":11429,"surname":11430,"affiliation":63,"orcid":63},"Pierre-Edouard","Honnet",{"paper_id":11421,"author_seq":232,"given_name":2907,"surname":11432,"affiliation":63,"orcid":63},"Popescu-Belis",{"paper_id":11421,"author_seq":218,"given_name":11434,"surname":11435,"affiliation":63,"orcid":63},"Claudiu","Musat",{"paper_id":11421,"author_seq":203,"given_name":1780,"surname":11437,"affiliation":63,"orcid":63},"Baeriswyl","The goal of this work is to design a machine translation (MT) system for a low-resource family of dialects, collectively known as Swiss German, which are widely spoken in Switzerland but seldom written.  We collected a significant number of parallel written resources to start with, up to a total of about 60k words.        
Moreover, we identified several other promising data sources for Swiss German.  Then, we designed and compared three strategies for normalizing Swiss German input in order to address the regional diversity.  We found that character-based neural MT was the best solution for text normalization.  In combination with phrase-based statistical MT, our solution reached 36% BLEU score when translating from the Bernese dialect.  This value, however, decreases as the testing data becomes more remote from the training one, geographically and topically.  These resources and normalization techniques are a first step towards full MT of Swiss German dialects.",{"paper_id":11440,"title":11441,"year":81,"month":855,"day":63,"doi":11442,"resource_url":11443,"first_page":63,"last_page":63,"pdf_url":11444,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11445,"paper_type":860,"authors":11446,"abstract":11459},"lrec2018-main-598","Improving domain-specific SMT for low-resourced languages using data from different domains","10.63317\u002F4mheb9it5fvo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-598","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F585.pdf","farhath-etal-2018-improving",[11447,11450,11453,11454,11457],{"paper_id":11440,"author_seq":247,"given_name":11448,"surname":11449,"affiliation":63,"orcid":63},"Fathima","Farhath",{"paper_id":11440,"author_seq":232,"given_name":11451,"surname":11452,"affiliation":63,"orcid":63},"Pranavan","Theivendiram",{"paper_id":11440,"author_seq":218,"given_name":5697,"surname":5698,"affiliation":63,"orcid":63},{"paper_id":11440,"author_seq":203,"given_name":11455,"surname":11456,"affiliation":63,"orcid":63},"Sanath","Jayasena",{"paper_id":11440,"author_seq":188,"given_name":11458,"surname":4267,"affiliation":63,"orcid":63},"Gihan","This paper evaluates the impact of different types of data sources in developing a domain-specific statistical machine translation (SMT) system for 
the domain of official government letters, between the low-resourced language pair Sinhala and Tamil. The baseline was built with a small in-domain parallel data set containing official government letters. The translation system was evaluated with two different test datasets. Test data from the same sources as training and tuning gave a higher score due to over-fitting, while the test data from a different source resulted in a considerably lower score.  With the motive to improve translation, more data was collected from, (a) different government sources other than official letters (pseudo in-domain), and (b) online sources such as blogs, news and wiki dumps (out-domain). Use of pseudo in-domain data showed an improvement for both the test sets as the language is formal and context was similar to that of the in-domain though the writing style varies. Out-domain data, however, did not give a positive impact, either in filtered or unfiltered forms, as the writing style was different and the context was much more general than that of the official government documents.",{"paper_id":11461,"title":11462,"year":81,"month":855,"day":63,"doi":11463,"resource_url":11464,"first_page":63,"last_page":63,"pdf_url":11465,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11466,"paper_type":860,"authors":11467,"abstract":11475},"lrec2018-main-599","Discovering Parallel Language Resources for Training MT 
Engines","10.63317\u002F3bke5nrob6mo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-599","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F604.pdf","papavassiliou-etal-2018-discovering",[11468,11471,11474],{"paper_id":11461,"author_seq":247,"given_name":11469,"surname":11470,"affiliation":63,"orcid":63},"Vassilis","Papavassiliou",{"paper_id":11461,"author_seq":232,"given_name":11472,"surname":11473,"affiliation":63,"orcid":63},"Prokopis","Prokopidis",{"paper_id":11461,"author_seq":218,"given_name":4632,"surname":4633,"affiliation":63,"orcid":63},"Web crawling is an efficient way for compiling the monolingual, parallel and\u002For domain-specific corpora needed for machine translation and other HLT applications. These corpora can be automatically processed to generate second order or synthesized derivative resources, including bilingual (general or domain-specific) lexica and terminology lists. In this submission, we discuss the architecture and use of the ILSP Focused Crawler (ILSP-FC), a system developed by researchers of the ILSP\u002FAthena RIC for the acquisition of such resources, and currently being used through the European Language Resource Coordination effort. 
ELRC aims to identify and gather language and translation data relevant to public services and governmental institutions across 30 European countries participating in the Connecting Europe Facility (CEF).",{"paper_id":11477,"title":11478,"year":81,"month":855,"day":63,"doi":11479,"resource_url":11480,"first_page":63,"last_page":63,"pdf_url":11481,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11482,"paper_type":860,"authors":11483,"abstract":11492},"lrec2018-main-600","A fine-grained error analysis of NMT, SMT and RBMT output for English-to-Dutch","10.63317\u002F3nu9ff32gr8f","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-600","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F611.pdf","van-brussel-etal-2018-fine",[11484,11486,11489],{"paper_id":11477,"author_seq":247,"given_name":1172,"surname":11485,"affiliation":63,"orcid":63},"Van Brussel",{"paper_id":11477,"author_seq":232,"given_name":11487,"surname":11488,"affiliation":63,"orcid":63},"Arda","Tezcan",{"paper_id":11477,"author_seq":218,"given_name":11490,"surname":11491,"affiliation":63,"orcid":63},"Lieve","Macken","This paper presents a fine-grained error comparison of the English-to-Dutch translations of a commercial neural, phrase-based and rule-based machine translation (MT) system. For phrase-based and rule-based machine translation, we make use of the annotated SCATE corpus of MT errors, enriching it with the annotation of neural MT errors and updating the SCATE error taxonomy to fit the neural MT output as well. Neural, in general, outperforms phrase-based and rule-based systems especially for fluency, except for lexical issues. On the accuracy level, the improvements are less obvious. The target sentence does not always contain traces or clues of content being missing (omissions). This has repercussions for quality estimation or gisting operating only on the monolingual level. 
Mistranslations are part of another well represented error category, comprising a high number of word-sense disambiguation errors and a variety of other mistranslation errors, making it more complex to annotate or post-edit.",{"paper_id":11494,"title":11495,"year":81,"month":855,"day":63,"doi":11496,"resource_url":11497,"first_page":63,"last_page":63,"pdf_url":11498,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11499,"paper_type":860,"authors":11500,"abstract":11509},"lrec2018-main-601","Collection and Analysis of Code-switch Egyptian Arabic-English Speech Corpus","10.63317\u002F2fi2pfcqkpmk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-601","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1046.pdf","hamed-etal-2018-collection",[11501,11504,11506],{"paper_id":11494,"author_seq":247,"given_name":11502,"surname":11503,"affiliation":63,"orcid":63},"Injy","Hamed",{"paper_id":11494,"author_seq":232,"given_name":1157,"surname":11505,"affiliation":63,"orcid":63},"Elmahdy",{"paper_id":11494,"author_seq":218,"given_name":11507,"surname":11508,"affiliation":63,"orcid":63},"Slim","Abdennadher","Speech corpora are key components needed by both: linguists (in language analyses, research and teaching languages) and Natural Language Processing (NLP) researchers (in training and evaluating several NLP tasks such as speech recognition, text-to-speech and speech-to-text synthesis). Despite of the great demand, there is still a huge shortage in available corpora, especially in the case of dialectal languages, and code-switched speech. In this paper, we present our efforts in collecting and analyzing a speech corpus for conversational Egyptian Arabic. As in other multilingual societies, it is common among Egyptians to use a mix of Arabic and English in daily conversations. The act of switching languages, at sentence boundaries or within the same sentence, is referred to as code-switching. 
The aim of this work is a three-fold: (1) gather conversational Egyptian Arabic spontaneous speech, (2) obtain manual transcriptions and (3) analyze the speech from the code-switching perspective. A subset of the transcriptions were manually annotated for part-of-speech (POS) tags. The POS distribution of the embedded words was analyzed as well as the POS distribution for the trigger words (Arabic words preceding a code-switching point). The speech corpus can be obtained by contacting the authors.",{"paper_id":11511,"title":11512,"year":81,"month":855,"day":63,"doi":11513,"resource_url":11514,"first_page":63,"last_page":63,"pdf_url":11515,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11516,"paper_type":860,"authors":11517,"abstract":11522},"lrec2018-main-602","Multimodal Lexical Translation","10.63317\u002F376c44dt48af","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-602","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F629.pdf","lala-specia-2018-multimodal",[11518,11521],{"paper_id":11511,"author_seq":247,"given_name":11519,"surname":11520,"affiliation":63,"orcid":63},"Chiraag","Lala",{"paper_id":11511,"author_seq":232,"given_name":10718,"surname":10719,"affiliation":63,"orcid":63},"Inspired by the tasks of Multimodal Machine Translation and Visual Sense Disambiguation we introduce a task called Multimodal Lexical Translation (MLT). The aim of this new task is to correctly translate an ambiguous word given its context - an image and a sentence in the source language. To facilitate the task, we introduce the MLT dataset, where each data point is a 4-tuple consisting of an ambiguous source word, its visual context (an image), its textual context (a source sentence), and its translation that conforms with the visual and textual contexts. 
The dataset has been created from the Multi30K corpus using word-alignment followed by human inspection for translations from English to German and English to French. We also introduce a simple heuristic to quantify the extent of the ambiguity of a word from the distribution of its translations and use it to select subsets of the MLT Dataset which are difficult to translate. These form a valuable multimodal and multilingual language resource with several potential uses including evaluation of lexical disambiguation within (Multimodal) Machine Translation systems.",{"paper_id":11524,"title":11525,"year":81,"month":855,"day":63,"doi":11526,"resource_url":11527,"first_page":63,"last_page":63,"pdf_url":11528,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11529,"paper_type":860,"authors":11530,"abstract":11538},"lrec2018-main-603","Literality and cognitive effort: Japanese and Spanish","10.63317\u002F2hpbuxdadf83","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-603","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F811.pdf","lacruz-etal-2018-literality",[11531,11534,11535],{"paper_id":11524,"author_seq":247,"given_name":11532,"surname":11533,"affiliation":63,"orcid":63},"Isabel","Lacruz",{"paper_id":11524,"author_seq":232,"given_name":1780,"surname":4210,"affiliation":63,"orcid":63},{"paper_id":11524,"author_seq":218,"given_name":11536,"surname":11537,"affiliation":63,"orcid":63},"Masaru","Yamada","We introduce a notion of pause-word ratio computed using ranges of pause lengths rather than lower cutoffs for pause lengths. Standard pause-word ratios are indicators of cognitive effort during different translation modalities.The pause range version allows for the study of how different types of pauses relate to the extent of cognitive effort and where it occurs in the translation process.  
In this article we focus on short monitoring pauses and how they relate to the cognitive effort involved in translation and post-editing for language pairs that are different in terms of semantic and syntactic remoteness. We use data from the CRITT TPR database, comparing translation and post-editing from English to Japanese and from English to Spanish, and study the interaction of pause-word ratio for short pauses ranging between 300 and 500ms with syntactic remoteness, measured by the CrossS feature, semantic remoteness, measured by HTra, and syntactic and semantic remoteness, measured by Literality.",{"paper_id":11540,"title":11541,"year":81,"month":855,"day":63,"doi":11542,"resource_url":11543,"first_page":63,"last_page":63,"pdf_url":11544,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11545,"paper_type":860,"authors":11546,"abstract":11552},"lrec2018-main-604","Evaluation of Machine Translation Performance Across Multiple Genres and Languages","10.63317\u002F5imihaq873kn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-604","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F853.pdf","van-der-wees-etal-2018-evaluation",[11547,11550,11551],{"paper_id":11540,"author_seq":247,"given_name":11548,"surname":11549,"affiliation":63,"orcid":63},"Marlies","van der Wees",{"paper_id":11540,"author_seq":232,"given_name":3666,"surname":3667,"affiliation":63,"orcid":63},{"paper_id":11540,"author_seq":218,"given_name":3669,"surname":3670,"affiliation":63,"orcid":63},"In this paper, we present evaluation corpora covering four genres for four language pairs that we harvested from the web in an automated fashion. We use these multi-genre benchmarks to evaluate the impact of genre differences on machine translation (MT).  
We observe that BLEU score differences between genres can be large and that, for all genres and all language pairs, translation quality improves when using four genre-optimized systems rather than a single genre-agnostic system. Finally, we train and use genre classifiers to route test documents to the most appropriate genre systems. The results of these experiments show that our multi-genre benchmarks can serve to advance research on text genre adaptation for MT.",{"paper_id":11554,"title":11555,"year":81,"month":855,"day":63,"doi":11556,"resource_url":11557,"first_page":63,"last_page":63,"pdf_url":11558,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11559,"paper_type":860,"authors":11560,"abstract":11566},"lrec2018-main-605","A Multilingual Dataset for Evaluating Parallel Sentence Extraction from Comparable Corpora","10.63317\u002F25ymczomp65x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-605","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F955.pdf","zweigenbaum-etal-2018-multilingual",[11561,11562,11563],{"paper_id":11554,"author_seq":247,"given_name":1388,"surname":1389,"affiliation":63,"orcid":63},{"paper_id":11554,"author_seq":232,"given_name":3443,"surname":3444,"affiliation":63,"orcid":63},{"paper_id":11554,"author_seq":218,"given_name":11564,"surname":11565,"affiliation":63,"orcid":63},"Reinhard","Rapp","Comparable corpora can be seen as a reservoir for parallel sentences and phrases to overcome limitations in variety and quantity encountered in existing parallel corpora.  This has motivated the design of methods to extract parallel sentences from comparable corporad.  Despite this interest and work, no shared dataset has been made available for this task until the 2017 BUCC Shared Task.  
We present the challenges faced to build such a dataset and the solutions adopted to design and create the 2017 BUCC Shared Task dataset, emphasizing issues we had to cope with to include Chinese as one of the languages. The resulting corpus contains a total of about 3.5 million distinct sentences in English, French, German, Russian, and Chinese, mostly from Wikipedia.  We illustrate the use of this dataset in the shared task and summarize the main results obtained by its participants.  We finally outline remaining issues.",{"paper_id":11568,"title":11569,"year":81,"month":855,"day":63,"doi":11570,"resource_url":11571,"first_page":63,"last_page":63,"pdf_url":11572,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11573,"paper_type":860,"authors":11574,"abstract":11584},"lrec2018-main-606","Manual vs Automatic Bitext Extraction","10.63317\u002F3e7ocg3pz9e6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-606","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1097.pdf","makazhanov-etal-2018-manual",[11575,11578,11581],{"paper_id":11568,"author_seq":247,"given_name":11576,"surname":11577,"affiliation":63,"orcid":63},"Aibek","Makazhanov",{"paper_id":11568,"author_seq":232,"given_name":11579,"surname":11580,"affiliation":63,"orcid":63},"Bagdat","Myrzakhmetov",{"paper_id":11568,"author_seq":218,"given_name":11582,"surname":11583,"affiliation":63,"orcid":63},"Zhenisbek","Assylbekov","We compare manual and automatic approaches to the problem of extracting bitexts from the Web in the framework of a case study on building a Russian-Kazakh parallel corpus. Our findings suggest that targeted, site-specific crawling results in cleaner bitexts with a higher ratio of parallel sentences. We also find that general crawlers combined with boilerplate removal tools tend to retrieve shorter texts, as some content gets cleaned out with the markup. 
When it comes to sentence splitting and alignment we show that investing some effort in data pre- and post-processing as well as fiddling with off-the-shelf solutions pays a noticeable dividend. Overall we observe that, depending on the source, automatic bitext extraction methods may lack severely in coverage (retrieve fewer sentence pairs) and on average are less precise (retrieve fewer parallel sentence pairs). We conclude that if one aims at extracting high-quality bitexts for a small number of language pairs, automatic methods best be avoided, or at least used with caution.",{"paper_id":11586,"title":11587,"year":81,"month":855,"day":63,"doi":11588,"resource_url":11589,"first_page":63,"last_page":63,"pdf_url":11590,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11591,"paper_type":860,"authors":11592,"abstract":11601},"lrec2018-main-607","A Morphologically Annotated Corpus of Emirati Arabic","10.63317\u002F4ujvhk7xskc6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-607","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F529.pdf","khalifa-etal-2018-morphologically",[11593,11594,11595,11596,11597,11598],{"paper_id":11586,"author_seq":247,"given_name":8340,"surname":8341,"affiliation":63,"orcid":63},{"paper_id":11586,"author_seq":232,"given_name":3646,"surname":3647,"affiliation":63,"orcid":63},{"paper_id":11586,"author_seq":218,"given_name":10438,"surname":10439,"affiliation":63,"orcid":63},{"paper_id":11586,"author_seq":203,"given_name":8337,"surname":8338,"affiliation":63,"orcid":63},{"paper_id":11586,"author_seq":188,"given_name":10433,"surname":10434,"affiliation":63,"orcid":63},{"paper_id":11586,"author_seq":172,"given_name":11599,"surname":11600,"affiliation":63,"orcid":63},"Meera","Al Kaabi","We present an ongoing effort on the first large-scale morphologically manually annotated corpus of Emirati Arabic. 
This corpus includes about 200,000 words selected from eight Gumar corpus novels from the Emirati Arabic variety. The selected texts are being annotated for tokenization, part-of-speech, lemmatization, English glosses and dialect identification. The orthography of the text is also adjusted for errors and inconsistencies. We discuss the guidelines for each part of the annotation components and the annotation interface we use. We report on the quality of the annotation through an inter annotator agreement measure.",{"paper_id":11603,"title":11604,"year":81,"month":855,"day":63,"doi":11605,"resource_url":11606,"first_page":63,"last_page":63,"pdf_url":11607,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11608,"paper_type":860,"authors":11609,"abstract":11627},"lrec2018-main-608","CoNLL-UL: Universal Morphological Lattices for Universal Dependency Parsing","10.63317\u002F3n4qqu4jzph4","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-608","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F625.pdf","more-etal-2018-conll",[11610,11612,11615,11618,11619,11620,11623,11624],{"paper_id":11603,"author_seq":247,"given_name":1797,"surname":11611,"affiliation":63,"orcid":63},"More",{"paper_id":11603,"author_seq":232,"given_name":11613,"surname":11614,"affiliation":63,"orcid":63},"Özlem","Çetinoğlu",{"paper_id":11603,"author_seq":218,"given_name":11616,"surname":11617,"affiliation":63,"orcid":63},"Çağrı","Çöltekin",{"paper_id":11603,"author_seq":203,"given_name":3646,"surname":3647,"affiliation":63,"orcid":63},{"paper_id":11603,"author_seq":188,"given_name":6238,"surname":6239,"affiliation":63,"orcid":63},{"paper_id":11603,"author_seq":172,"given_name":11621,"surname":11622,"affiliation":63,"orcid":63},"Djamé","Seddah",{"paper_id":11603,"author_seq":155,"given_name":6112,"surname":7143,"affiliation":63,"orcid":63},{"paper_id":11603,"author_seq":138,"given_name":11625,"surname":11626,"affiliation":63,"orcid":63},"Reut",
"Tsarfaty","Following the development of the universal dependencies (UD) framework and the CoNLL 2017 Shared Task on end-to-end UD parsing, we address the need for a universal representation of morphological analysis which on the one hand can capture a range of different alternative morphological analyses of surface tokens, and on the other hand is compatible with the segmentation and morphological annotation guidelines prescribed for UD treebanks. We propose the CoNLL universal lattices (CoNLL-UL) format, a new annotation format for word lattices that represent morphological analyses, and provide resources that obey this format for a range of typologically different languages. The resources we provide are harmonized with the two-level representation and morphological annotation in their respective UD v2 treebanks, thus enabling research on universal models for morphological and syntactic parsing, in both pipeline and joint settings, and presenting new opportunities in the development of UD resources for low-resource languages.",{"paper_id":11629,"title":11630,"year":81,"month":855,"day":63,"doi":11631,"resource_url":11632,"first_page":63,"last_page":63,"pdf_url":11633,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11634,"paper_type":860,"authors":11635,"abstract":11640},"lrec2018-main-609","Manually Annotated Corpus of Polish Texts Published between 1830 and 1918","10.63317\u002F45z36wrvg3z9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-609","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F675.pdf","kieras-wolinski-2018-manually",[11636,11639],{"paper_id":11629,"author_seq":247,"given_name":11637,"surname":11638,"affiliation":63,"orcid":63},"Witold","Kieraś",{"paper_id":11629,"author_seq":232,"given_name":5535,"surname":6185,"affiliation":63,"orcid":63},"The paper presents a manually annotated 625,000 tokens large historical corpus of -- fiction, drama, popular science, essays and newspapers of 
the period. The corpus provides three layers: transliteration, transcription and morphosyntactic annotation. The annotation process as well as the corpus itself are described in detail in the paper.",{"paper_id":11642,"title":11643,"year":81,"month":855,"day":63,"doi":11644,"resource_url":11645,"first_page":63,"last_page":63,"pdf_url":11646,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11647,"paper_type":860,"authors":11648,"abstract":11665},"lrec2018-main-610","Evaluating Inflectional Complexity Crosslinguistically: a Processing Perspective","10.63317\u002F455qpw3quj7f","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-610","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F745.pdf","marzi-etal-2018-evaluating",[11649,11651,11654,11657,11659,11662],{"paper_id":11642,"author_seq":247,"given_name":10215,"surname":11650,"affiliation":63,"orcid":63},"Marzi",{"paper_id":11642,"author_seq":232,"given_name":11652,"surname":11653,"affiliation":63,"orcid":63},"Marcello","Ferro",{"paper_id":11642,"author_seq":218,"given_name":11655,"surname":11656,"affiliation":63,"orcid":63},"Ouafae","Nahli",{"paper_id":11642,"author_seq":203,"given_name":2999,"surname":11658,"affiliation":63,"orcid":63},"Belik",{"paper_id":11642,"author_seq":188,"given_name":11660,"surname":11661,"affiliation":63,"orcid":63},"Stavros","Bompolas",{"paper_id":11642,"author_seq":172,"given_name":11663,"surname":11664,"affiliation":63,"orcid":63},"Vito","Pirrelli","The paper provides a cognitively motivated method for evaluating the inflectional complexity of a language, based on a sample of \"raw\" inflected word forms processed and learned by a recurrent self-organising neural network with fixed parameter setting. Training items contain no information about either morphological content or structure. This makes the proposed method independent of both meta-linguistic issues (e.g. 
format and expressive power of descriptive rules, manual or automated segmentation of input forms, number of inflectional classes etc.) and language-specific typological aspects (e.g. word-based, stem-based or template-based morphology). Results are illustrated by contrasting Arabic, English, German, Greek, Italian and Spanish.",{"paper_id":11667,"title":11668,"year":81,"month":855,"day":63,"doi":11669,"resource_url":11670,"first_page":63,"last_page":63,"pdf_url":11671,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11672,"paper_type":860,"authors":11673,"abstract":11690},"lrec2018-main-611","Parser combinators for Tigrinya and Oromo morphology","10.63317\u002F268voan22369","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-611","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F798.pdf","littell-etal-2018-parser",[11674,11675,11677,11678,11681,11684,11685,11688],{"paper_id":11667,"author_seq":247,"given_name":2451,"surname":8594,"affiliation":63,"orcid":63},{"paper_id":11667,"author_seq":232,"given_name":2561,"surname":11676,"affiliation":63,"orcid":63},"McCoy",{"paper_id":11667,"author_seq":218,"given_name":7171,"surname":2403,"affiliation":63,"orcid":63},{"paper_id":11667,"author_seq":203,"given_name":11679,"surname":11680,"affiliation":63,"orcid":63},"Shruti","Rijhwani",{"paper_id":11667,"author_seq":188,"given_name":11682,"surname":11683,"affiliation":63,"orcid":63},"Zaid","Sheikh",{"paper_id":11667,"author_seq":172,"given_name":1199,"surname":8589,"affiliation":63,"orcid":63},{"paper_id":11667,"author_seq":155,"given_name":11686,"surname":11687,"affiliation":63,"orcid":63},"Teruko","Mitamura",{"paper_id":11667,"author_seq":138,"given_name":5173,"surname":11689,"affiliation":63,"orcid":63},"Levin","We present rule-based morphological parsers in the Tigrinya and Oromo languages, based on a parser-combinator rather than finite-state paradigm.  
This paradigm allows rapid development and ease of integration with other systems, although at the cost of non-optimal theoretical efficiency.  These parsers produce multiple output representations simultaneously, including lemmatization, morphological segmentation, and an English word-for-word gloss, and we evaluate these representations as input for entity detection and linking and humanitarian need detection.",{"paper_id":11692,"title":11693,"year":81,"month":855,"day":63,"doi":11694,"resource_url":11695,"first_page":63,"last_page":63,"pdf_url":11696,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11697,"paper_type":860,"authors":11698,"abstract":11701},"lrec2018-main-612","Massively Translingual Compound Analysis and Translation Discovery","10.63317\u002F3a446cs6jxje","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-612","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1029.pdf","wu-yarowsky-2018-massively",[11699,11700],{"paper_id":11692,"author_seq":247,"given_name":3702,"surname":3703,"affiliation":63,"orcid":63},{"paper_id":11692,"author_seq":232,"given_name":1199,"surname":3705,"affiliation":63,"orcid":63},"Word formation via compounding is a very widely observed but quite diverse phenomenon across the world's languages, but the compositional semantics of a compound are often productively correlated between even distant languages. Using only freely available bilingual dictionaries and no annotated training data, we derive novel models for analyzing compound words and effectively generate novel foreign-language translations of English concepts using these models. 
In addition, we release a massively multilingual dataset of compound words along with their decompositions, covering over 21,000 instances in 329 languages, a previously unprecedented scale which should both productively support machine translation (especially in low resource languages) and also facilitate researchers in their further analysis and modeling of compounds and compound processes across the world's languages.",{"paper_id":11703,"title":11704,"year":81,"month":855,"day":63,"doi":11705,"resource_url":11706,"first_page":63,"last_page":63,"pdf_url":11707,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11708,"paper_type":860,"authors":11709,"abstract":11712},"lrec2018-main-613","Building a Morphological Treebank for German from a Linguistic Database","10.63317\u002F39u38a8k7qg2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-613","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1077.pdf","steiner-ruppenhofer-2018-building",[11710,11711],{"paper_id":11703,"author_seq":247,"given_name":7585,"surname":9807,"affiliation":63,"orcid":63},{"paper_id":11703,"author_seq":232,"given_name":2804,"surname":2805,"affiliation":63,"orcid":63},"German is a language with complex morphological processes. Its long and often ambiguous word forms present a bottleneck problem in natural language processing. As a step towards morphological analyses of high quality, this paper introduces a morphological treebank for German. It is derived from the linguistic database CELEX which is a standard resource for German morphology. We build on its refurbished, modernized and partially revised version. The derivation of the morphological trees is not trivial, especially for such cases of conversions which are morpho-semantically opaque and merely of diachronic interest. We develop solutions and present exemplary analyses. 
The resulting database comprises about 40,000 morphological trees of a German base vocabulary whose format and grade of detail can be chosen according to the requirements of the applications. The Perl scripts for the generation of the treebank are publicly available on github. In our discussion, we show some future directions for morphological treebanks. In particular, we aim at the combination with other reliable lexical resources such as GermaNet.",{"paper_id":11714,"title":11715,"year":81,"month":855,"day":63,"doi":11716,"resource_url":11717,"first_page":63,"last_page":63,"pdf_url":11718,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11719,"paper_type":860,"authors":11720,"abstract":11726},"lrec2018-main-614","Baselines and Test Data for Cross-Lingual Inference","10.63317\u002F2uttkx7m22xy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-614","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F52.pdf","agic-schluter-2018-baselines",[11721,11724],{"paper_id":11714,"author_seq":247,"given_name":11722,"surname":11723,"affiliation":63,"orcid":63},"Željko","Agić",{"paper_id":11714,"author_seq":232,"given_name":5342,"surname":11725,"affiliation":63,"orcid":63},"Schluter","The recent years have seen a revival of interest in textual entailment, sparked by i) the emergence of powerful deep neural network learners for natural language processing and ii) the timely development of large-scale evaluation datasets such as SNLI. Recast as natural language inference, the problem now amounts to detecting the relation between pairs of statements: they either contradict or entail one another, or they are mutually neutral. Current research in natural language inference is effectively exclusive to English. In this paper, we propose to advance the research in SNLI-style natural language inference toward multilingual evaluation. 
To that end, we provide test data for four major languages: Arabic, French, Spanish, and Russian. We experiment with a set of baselines. Our systems are based on cross-lingual word embeddings and machine translation. While our best system scores an average accuracy of just over 75%, we focus largely on enabling further research in multilingual inference.",{"paper_id":11728,"title":11729,"year":81,"month":855,"day":63,"doi":11730,"resource_url":11731,"first_page":63,"last_page":63,"pdf_url":11732,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11733,"paper_type":860,"authors":11734,"abstract":11742},"lrec2018-main-615","CATS: A Tool for Customized Alignment of Text Simplification Corpora","10.63317\u002F32gxagvexvew","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-615","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F630.pdf","stajner-etal-2018-cats",[11735,11736,11738,11741],{"paper_id":11728,"author_seq":247,"given_name":9446,"surname":9447,"affiliation":63,"orcid":63},{"paper_id":11728,"author_seq":232,"given_name":4664,"surname":11737,"affiliation":63,"orcid":63},"Franco-Salvador",{"paper_id":11728,"author_seq":218,"given_name":11739,"surname":11740,"affiliation":63,"orcid":63},"Paolo","Rosso",{"paper_id":11728,"author_seq":203,"given_name":2741,"surname":2742,"affiliation":63,"orcid":63},"In text simplification (TS), parallel corpora consisting of original sentences and their manually simplified counterparts are very scarce and small in size, which impedes building supervised automated TS systems with sufficient coverage. 
Furthermore, the existing corpora usually do not distinguish sentence pairs which present full matches (both sentences contain the same information), and those that present only partial matches (the two sentences share the meaning only partially), thus not allowing for building customized automated TS systems which would separately model different simplification transformations. In this paper, we present our freely available, language-independent tool for sentence alignment from parallel\u002Fcomparable TS resources (document-aligned resources), which additionally offers the possibility for filtering sentences depending on the level of their semantic overlap. We perform in-depth human evaluation of the tool's performance on English and Spanish corpora, and explore its capacities for classification of sentence pairs according to the simplification operation they model.",{"paper_id":11744,"title":11745,"year":81,"month":855,"day":63,"doi":11746,"resource_url":11747,"first_page":63,"last_page":63,"pdf_url":11748,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11749,"paper_type":860,"authors":11750,"abstract":11760},"lrec2018-main-616","KIT-Multi: A Translation-Oriented Multilingual Embedding Corpus","10.63317\u002F49xbed43gcqn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-616","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F688.pdf","ha-etal-2018-kit",[11751,11753,11754,11756,11759],{"paper_id":11744,"author_seq":247,"given_name":11752,"surname":5554,"affiliation":63,"orcid":63},"Thanh-Le",{"paper_id":11744,"author_seq":232,"given_name":2633,"surname":6709,"affiliation":63,"orcid":63},{"paper_id":11744,"author_seq":218,"given_name":1244,"surname":11755,"affiliation":63,"orcid":63},"Sperber",{"paper_id":11744,"author_seq":203,"given_name":11757,"surname":11758,"affiliation":63,"orcid":63},"Ngoc 
Quan","Pham",{"paper_id":11744,"author_seq":188,"given_name":2736,"surname":6711,"affiliation":63,"orcid":63},"Cross-lingual word embeddings are the representations of words across languages in a shared continuous vector space. Cross-lingual word embeddings have been shown to be helpful in the development of cross-lingual natural language processing tools. In case of more than two languages involved, we call them multilingual word embeddings. In this work, we introduce a multilingual word embedding corpus which is acquired by using neural machine translation.  Unlike other cross-lingual embedding corpora, the embeddings can be learned from significantly smaller portions of data and for multiple languages at once. An intrinsic evaluation on monolingual tasks shows that our method is fairly competitive to the prevalent methods but on the cross-lingual document classification task, it obtains the best figures.  Furthermore, the corpus is being analyzed regarding its usage and usefulness in other cross-lingual tasks. 
\\\\ \\newline \\Keywords{multilingual embeddings, cross-lingual embeddings, neural machine translation, multi-source translation} }",{"paper_id":11762,"title":11763,"year":81,"month":855,"day":63,"doi":11764,"resource_url":11765,"first_page":63,"last_page":63,"pdf_url":11766,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11767,"paper_type":860,"authors":11768,"abstract":11790},"lrec2018-main-617","Multi-lingual Argumentative Corpora in English, Turkish, Greek, Albanian, Croatian, Serbian, Macedonian, Bulgarian, Romanian and Arabic","10.63317\u002F3pva2b8zg2eb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-617","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F785.pdf","sliwa-etal-2018-multi",[11769,11772,11773,11775,11778,11781,11784,11787],{"paper_id":11762,"author_seq":247,"given_name":11770,"surname":11771,"affiliation":63,"orcid":63},"Alfred","Sliwa",{"paper_id":11762,"author_seq":232,"given_name":6602,"surname":3400,"affiliation":63,"orcid":63},{"paper_id":11762,"author_seq":218,"given_name":11774,"surname":5232,"affiliation":63,"orcid":63},"Ruishen",{"paper_id":11762,"author_seq":203,"given_name":11776,"surname":11777,"affiliation":63,"orcid":63},"Niravkumar","Borad",{"paper_id":11762,"author_seq":188,"given_name":11779,"surname":11780,"affiliation":63,"orcid":63},"Seyedeh","Ziyaei",{"paper_id":11762,"author_seq":172,"given_name":11782,"surname":11783,"affiliation":63,"orcid":63},"Mina","Ghobadi",{"paper_id":11762,"author_seq":155,"given_name":11785,"surname":11786,"affiliation":63,"orcid":63},"Firas","Sabbah",{"paper_id":11762,"author_seq":138,"given_name":11788,"surname":11789,"affiliation":63,"orcid":63},"Ahmet","Aker","Argumentative corpora are costly to create and are available in only few languages with English dominating the area. In this paper we release the first publicly available corpora in all Balkan languages and Arabic. 
The corpora are obtained by using parallel corpora where the source language is English and target language is either a Balkan language or Arabic. We use 8 different argument mining classifiers trained for English, apply them all on the source language and project the decision made by the classifiers to the target language. We assess the performance of the classifiers on a manually annotated news corpus. Our results show when at least 3 to 6 classifiers are used to judge a piece of text as argumentative an F1-score above 90% is obtained.",{"paper_id":11792,"title":11793,"year":81,"month":855,"day":63,"doi":11794,"resource_url":11795,"first_page":63,"last_page":63,"pdf_url":11796,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11797,"paper_type":860,"authors":11798,"abstract":11804},"lrec2018-main-618","SemR-11: A Multi-Lingual Gold-Standard for Semantic Similarity and Relatedness for Eleven Languages","10.63317\u002F49o3z9r9waa3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-618","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F863.pdf","barzegar-etal-2018-semr",[11799,11800,11801,11802,11803],{"paper_id":11792,"author_seq":247,"given_name":4782,"surname":4783,"affiliation":63,"orcid":63},{"paper_id":11792,"author_seq":232,"given_name":4785,"surname":4786,"affiliation":63,"orcid":63},{"paper_id":11792,"author_seq":218,"given_name":8478,"surname":8479,"affiliation":63,"orcid":63},{"paper_id":11792,"author_seq":203,"given_name":4791,"surname":4792,"affiliation":63,"orcid":63},{"paper_id":11792,"author_seq":188,"given_name":1477,"surname":4789,"affiliation":63,"orcid":63},"This work describes SemR-11, a multi-lingual dataset for evaluating semantic similarity and relatedness for 11 languages (German, French, Russian, Italian, Dutch, Chinese, Portuguese, Swedish, Spanish, Arabic and Persian). 
Semantic similarity and relatedness gold standards have been initially used to support the evaluation of semantic distance measures in the context of linguistic and knowledge resources and distributional semantic models. SemR-11 builds upon the English gold-standards of Miller & Charles (MC), Rubenstein & Goodenough (RG), WordSimilarity 353 (WS-353), and Simlex-999, providing a canonical translation for them. The final dataset consists of 15,917 word pairs and can be used to support the construction and evaluation of semantic similarity\u002Frelatedness and distributional semantic models. As a case study, the SemR-11 test collections was used to investigate how different distributional semantic models built from corpora in different languages and with different sizes perform in computing semantic relatedness similarity and relatedness tasks.",{"paper_id":11806,"title":11807,"year":81,"month":855,"day":63,"doi":11808,"resource_url":11809,"first_page":63,"last_page":63,"pdf_url":11810,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11811,"paper_type":860,"authors":11812,"abstract":11847},"lrec2018-main-619","Corpora with Part-of-Speech Annotations for Three Regional Languages of France: Alsatian, Occitan and 
Picard","10.63317\u002F2kaaqqdvhqhv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-619","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F288.pdf","bernhard-etal-2018-corpora",[11813,11814,11817,11819,11822,11824,11827,11829,11832,11835,11837,11839,11841,11843,11845],{"paper_id":11806,"author_seq":247,"given_name":1130,"surname":8064,"affiliation":63,"orcid":63},{"paper_id":11806,"author_seq":232,"given_name":11815,"surname":11816,"affiliation":63,"orcid":63},"Anne-Laure","Ligozat",{"paper_id":11806,"author_seq":218,"given_name":11818,"surname":1248,"affiliation":63,"orcid":63},"Fanny",{"paper_id":11806,"author_seq":203,"given_name":11820,"surname":11821,"affiliation":63,"orcid":63},"Myriam","Bras",{"paper_id":11806,"author_seq":188,"given_name":1388,"surname":11823,"affiliation":63,"orcid":63},"Magistry",{"paper_id":11806,"author_seq":172,"given_name":11825,"surname":11826,"affiliation":63,"orcid":63},"Marianne","Vergez-Couret",{"paper_id":11806,"author_seq":155,"given_name":6912,"surname":11828,"affiliation":63,"orcid":63},"Steiblé",{"paper_id":11806,"author_seq":138,"given_name":11830,"surname":11831,"affiliation":63,"orcid":63},"Pascale","Erhart",{"paper_id":11806,"author_seq":121,"given_name":11833,"surname":11834,"affiliation":63,"orcid":63},"Nabil","Hathout",{"paper_id":11806,"author_seq":104,"given_name":9015,"surname":11836,"affiliation":63,"orcid":63},"Huck",{"paper_id":11806,"author_seq":87,"given_name":9004,"surname":11838,"affiliation":63,"orcid":63},"Rey",{"paper_id":11806,"author_seq":73,"given_name":2089,"surname":11840,"affiliation":63,"orcid":63},"Reynés",{"paper_id":11806,"author_seq":55,"given_name":7568,"surname":11842,"affiliation":63,"orcid":63},"Rosset",{"paper_id":11806,"author_seq":38,"given_name":5273,"surname":11844,"affiliation":63,"orcid":63},"Sibille",{"paper_id":11806,"author_seq":17,"given_name":1615,"surname":11846,"affiliation":63,"orcid":63},"Lavergne","This article describes the creation of 
corpora with part-of-speech annotations for three regional languages of France: Alsatian, Occitan and Picard. These manual annotations were performed in the context of the RESTAURE project, whose goal is to develop resources and tools for these under-resourced French regional languages. The article presents the tagsets used in the annotation process as well as the resulting annotated corpora.",{"paper_id":11849,"title":11850,"year":81,"month":855,"day":63,"doi":11851,"resource_url":11852,"first_page":63,"last_page":63,"pdf_url":11853,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11854,"paper_type":860,"authors":11855,"abstract":11862},"lrec2018-main-620","Part-of-Speech Tagging for Arabic Gulf Dialect Using Bi-LSTM","10.63317\u002F4bgx23eisvz7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-620","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F483.pdf","alharbi-etal-2018-part",[11856,11857,11858,11859,11861],{"paper_id":11849,"author_seq":247,"given_name":1163,"surname":1164,"affiliation":63,"orcid":63},{"paper_id":11849,"author_seq":232,"given_name":1169,"surname":1170,"affiliation":63,"orcid":63},{"paper_id":11849,"author_seq":218,"given_name":1148,"surname":1149,"affiliation":63,"orcid":63},{"paper_id":11849,"author_seq":203,"given_name":1154,"surname":11860,"affiliation":63,"orcid":63},"AbdelAli",{"paper_id":11849,"author_seq":188,"given_name":1151,"surname":1152,"affiliation":63,"orcid":63},"Part-of-speech (POS) tagging is one of the most important addressed areas in the natural language processing (NLP). There are effective POS taggers for many languages including Arabic. However, POS research for Arabic focused mainly on Modern Standard Arabic (MSA), while less attention was directed towards Dialect Arabic (DA). MSA is the formal variant which is mainly found in news and formal text books, while DA is the informal spoken Arabic that varies among different regions in the Arab world. 
DA is heavily used online due to the large spread of social media, which increased research directions towards building NLP tools for DA. Most research on DA focuses on Egyptian and Levantine, while much less attention is given to the Gulf dialect. In this paper, we present a more effective POS tagger for the Arabic Gulf dialect than currently available Arabic POS taggers. Our work includes preparing a POS tagging dataset, engineering multiple sets of features, and applying two machine learning methods, namely Support Vector Machine (SVM) classifier and bi-directional Long Short Term Memory (Bi-LSTM) for sequence modeling. We have improved POS tagging for Gulf dialect from 75% accuracy using a state-of-the-art MSA POS tagger to over 91% accuracy using a Bi-LSTM labeler.",{"paper_id":11864,"title":11865,"year":81,"month":855,"day":63,"doi":11866,"resource_url":11867,"first_page":63,"last_page":63,"pdf_url":11868,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11869,"paper_type":860,"authors":11870,"abstract":11876},"lrec2018-main-621","Web-based Annotation Tool for Inflectional Language Resources","10.63317\u002F2rd753aovzwz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-621","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F868.pdf","alosaimy-atwell-2018-web",[11871,11874],{"paper_id":11864,"author_seq":247,"given_name":11872,"surname":11873,"affiliation":63,"orcid":63},"Abdulrahman","Alosaimy",{"paper_id":11864,"author_seq":232,"given_name":5665,"surname":11875,"affiliation":63,"orcid":63},"Atwell","We present Wasim, a web-based tool for semi-automatic morphosyntactic annotation of inflectional languages resources. The tool features high flexibility in segmenting tokens, editing, diacritizing, and labelling tokens and segments. Text annotation of highly inflectional languages (including Arabic) requires key functionality which we could not see in a survey of existing tools. 
Wasim integrates with morphological analysers to speed up the annotation process by selecting one from their proposed analyses. It integrates as well with external POS taggers for kick-start annotation and adaptive predicting based on annotations made so far. It aims to speed up the annotation by completely relying on a keyboard interface, with no mouse interaction required. Wasim has been tested on four case studies and these features proved to be useful. The source-code is released under the MIT license.",{"paper_id":11878,"title":11879,"year":81,"month":855,"day":63,"doi":11880,"resource_url":11881,"first_page":63,"last_page":63,"pdf_url":11882,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11883,"paper_type":860,"authors":11884,"abstract":11895},"lrec2018-main-622","HiNTS: A Tagset for Middle Low German","10.63317\u002F2chzp4dw4o6x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-622","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F870.pdf","barteld-etal-2018-hints",[11885,11888,11890,11892],{"paper_id":11878,"author_seq":247,"given_name":11886,"surname":11887,"affiliation":63,"orcid":63},"Fabian","Barteld",{"paper_id":11878,"author_seq":232,"given_name":11312,"surname":11889,"affiliation":63,"orcid":63},"Ihden",{"paper_id":11878,"author_seq":218,"given_name":8608,"surname":11891,"affiliation":63,"orcid":63},"Dreessen",{"paper_id":11878,"author_seq":203,"given_name":11893,"surname":11894,"affiliation":63,"orcid":63},"Ingrid","Schröder","In this paper, we describe the “Historisches Niederdeutsch Tagset” (HiNTS). This tagset has been developed for annotating parts-of-speech and morphology in Middle Low German texts, a group of historical (1200–1650) dialects of German. A non-standardized language such as Middle Low German has special conditions and requirements which have to be considered when designing a tagset for part of speech and morphology. We explain these requirements, i.e. 
the need to encode ambiguities while allowing the annotator to be as specific as possible, and our approach for dealing with them in the tagset. We then describe two special features of the tagset. In order to prove the benefit of these tags and corresponding annotation rules, we present example searches and the possible analyses arising from the results of such searches. Besides the usefulness of our tagset, we also considered its reliability in annotation using inter-annotator agreement experiments. The results of these experiments are presented and explained.",{"paper_id":11897,"title":11898,"year":81,"month":855,"day":63,"doi":11899,"resource_url":11900,"first_page":63,"last_page":63,"pdf_url":11901,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11902,"paper_type":860,"authors":11903,"abstract":11909},"lrec2018-main-623","Leveraging Lexical Resources and Constraint Grammar for Rule-Based Part-of-Speech Tagging in Welsh","10.63317\u002F483te4x2sztc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-623","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F885.pdf","neale-etal-2018-leveraging",[11904,11905,11907,11908],{"paper_id":11897,"author_seq":247,"given_name":1085,"surname":5852,"affiliation":63,"orcid":63},{"paper_id":11897,"author_seq":232,"given_name":2184,"surname":11906,"affiliation":63,"orcid":63},"Donnelly",{"paper_id":11897,"author_seq":218,"given_name":1726,"surname":3838,"affiliation":63,"orcid":63},{"paper_id":11897,"author_seq":203,"given_name":3835,"surname":3836,"affiliation":63,"orcid":63},"As the quantity of annotated language data and the quality of machine learning algorithms have increased over time, statistical part-of-speech (POS) taggers trained over large datasets have become as robust or better than their rule-based counterparts. However, for lesser-resourced languages such as Welsh there is simply not enough accurately annotated data to train a statistical POS tagger. 
Furthermore, many of the more popular rule-based taggers still require that their rules be inferred from annotated data, which while not as extensive as that required for training a statistical tagger must still be sizeable.  In this paper we describe CyTag, a rule-based POS tagger for Welsh based on the VISL Constraint Grammar parser. Leveraging lexical information from Eurfa (an open-source dictionary for Welsh), we extract lists of possible POS tags for each word token in a running text and then apply various constraints - to prune the number of possible tags until the most appropriate tag for a given token can be selected. We explain how this approach is particularly useful in dealing with some of the specific intricacies of Welsh and present an evaluation of the performance of the tagger using a manually checked test corpus of 611 Welsh sentences.",{"paper_id":11911,"title":11912,"year":81,"month":855,"day":63,"doi":11913,"resource_url":11914,"first_page":63,"last_page":63,"pdf_url":11915,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11916,"paper_type":860,"authors":11917,"abstract":11925},"lrec2018-main-624","Graph Based Semi-Supervised Learning Approach for Tamil POS tagging","10.63317\u002F25poaq4mqpm8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-624","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F932.pdf","thayaparan-etal-2018-graph",[11918,11921,11922],{"paper_id":11911,"author_seq":247,"given_name":11919,"surname":11920,"affiliation":63,"orcid":63},"Mokanarangan","Thayaparan",{"paper_id":11911,"author_seq":232,"given_name":5697,"surname":5698,"affiliation":63,"orcid":63},{"paper_id":11911,"author_seq":218,"given_name":11923,"surname":11924,"affiliation":63,"orcid":63},"Uthayasanker","Thayasivam","Parts of Speech (POS) tagging is an important pre-requisite for various Natural Language Processing tasks. 
POS tagging is rather challenging for morphologically rich languages such as Tamil. Being low-resourced, Tamil does not have a large POS annotated corpus to build good quality POS taggers using supervised machine learning techniques. In order to gain the maximum out of the existing Tamil POS tagged corpora, we have developed a graph-based semi-supervised learning approach to classify unlabelled data by exploiting a small sized POS labelled data set. In this approach, both labelled and unlabelled data are converted to vectors using word embeddings and a weighted graph is constructed using Mahalanobis distance. Then semi-supervised learning (SSL) algorithms are used to classify the unlabelled data. We were able to gain an accuracy of 0.8743 over an accuracy of 0.7333 produced by a CRF tagger for the same limited size corpus.",{"paper_id":11927,"title":11928,"year":81,"month":855,"day":63,"doi":11929,"resource_url":11930,"first_page":63,"last_page":63,"pdf_url":11931,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11932,"paper_type":860,"authors":11933,"abstract":11937},"lrec2018-main-625","What Causes the Differences in Communication Styles? 
A Multicultural Study on Directness and Elaborateness","10.63317\u002F2rj37d6duy39","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-625","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F167.pdf","miehle-etal-2018-causes",[11934,11935,11936],{"paper_id":11927,"author_seq":247,"given_name":3146,"surname":3147,"affiliation":63,"orcid":63},{"paper_id":11927,"author_seq":232,"given_name":1251,"surname":1252,"affiliation":63,"orcid":63},{"paper_id":11927,"author_seq":218,"given_name":1471,"surname":3158,"affiliation":63,"orcid":63},"With the aim of designing a Spoken Dialogue System which adapts to the user's communication idiosyncrasies, we present a multicultural study to investigate the causes of differences in the communication styles elaborateness and directness in Human-Computer Interaction. By adapting the system's behaviour to the user, the conversation agent may appear more familiar and trustworthy. 339 persons from Germany, Russia, Poland, Spain and the United Kingdom participated in this web-based study. The participants had to imagine that they are talking to a digital agent. For every dialogue turn, they had to read four different variants of the system output and indicate their preference. 
With the results of this study, we could demonstrate the influence of the user's culture and gender, the frequency of use of speech based assistants as well as the system's role on the user's preference concerning the system's communication style in terms of its elaborateness and its directness.",{"paper_id":11939,"title":11940,"year":81,"month":855,"day":63,"doi":11941,"resource_url":11942,"first_page":63,"last_page":63,"pdf_url":11943,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11944,"paper_type":860,"authors":11945,"abstract":11965},"lrec2018-main-626","FARMI: A FrAmework for Recording Multi-Modal Interactions","10.63317\u002F47vimtp2qx3r","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-626","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F713.pdf","jonell-etal-2018-farmi",[11946,11947,11950,11953,11954,11955,11958,11960,11961,11964],{"paper_id":11939,"author_seq":247,"given_name":1272,"surname":1273,"affiliation":63,"orcid":63},{"paper_id":11939,"author_seq":232,"given_name":11948,"surname":11949,"affiliation":63,"orcid":63},"Mattias","Bystedt",{"paper_id":11939,"author_seq":218,"given_name":11951,"surname":11952,"affiliation":63,"orcid":63},"Per","Fallgren",{"paper_id":11939,"author_seq":203,"given_name":1263,"surname":1264,"affiliation":63,"orcid":63},{"paper_id":11939,"author_seq":188,"given_name":6490,"surname":6491,"affiliation":63,"orcid":63},{"paper_id":11939,"author_seq":172,"given_name":11956,"surname":11957,"affiliation":63,"orcid":63},"Zofia","Malisz",{"paper_id":11939,"author_seq":155,"given_name":1614,"surname":11959,"affiliation":63,"orcid":63},"Mascarenhas",{"paper_id":11939,"author_seq":138,"given_name":1275,"surname":1276,"affiliation":63,"orcid":63},{"paper_id":11939,"author_seq":121,"given_name":11962,"surname":11963,"affiliation":63,"orcid":63},"Eran","Raveh",{"paper_id":11939,"author_seq":104,"given_name":3250,"surname":3251,"affiliation":63,"orcid":63},"In this paper we 
present (1) a processing architecture used to collect multi-modal sensor data, both for corpora collection and real-time processing, (2) an open-source implementation thereof and (3) a use-case where we deploy the architecture in a multi-party deception game, featuring six human players and one robot. The architecture is agnostic to the choice of hardware (e.g. microphones, cameras, etc.) and programming languages, although our implementation is mostly written in Python. In our use-case, different methods of capturing verbal and non-verbal cues from the participants were used. These were processed in real-time and used to inform the robot about the participants’ deceptive behaviour. The framework is of particular interest for researchers who are interested in the collection of multi-party, richly recorded corpora and the design of conversational systems. Moreover for researchers who are interested in human-robot interaction the available modules offer the possibility to easily create both autonomous and wizard-of-Oz interactions.",{"paper_id":11967,"title":11968,"year":81,"month":855,"day":63,"doi":11969,"resource_url":11970,"first_page":63,"last_page":63,"pdf_url":11971,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11972,"paper_type":860,"authors":11973,"abstract":11986},"lrec2018-main-627","Creating Large-Scale Argumentation Structures for Dialogue 
Systems","10.63317\u002F3mynzwj9uhgj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-627","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F719.pdf","sakai-etal-2018-creating",[11974,11977,11979,11980,11983,11985],{"paper_id":11967,"author_seq":247,"given_name":11975,"surname":11976,"affiliation":63,"orcid":63},"Kazuki","Sakai",{"paper_id":11967,"author_seq":232,"given_name":2894,"surname":11978,"affiliation":63,"orcid":63},"Inago",{"paper_id":11967,"author_seq":218,"given_name":9222,"surname":9223,"affiliation":63,"orcid":63},{"paper_id":11967,"author_seq":203,"given_name":11981,"surname":11982,"affiliation":63,"orcid":63},"Yuichiro","Yoshikawa",{"paper_id":11967,"author_seq":188,"given_name":2172,"surname":11984,"affiliation":63,"orcid":63},"Ishiguro",{"paper_id":11967,"author_seq":172,"given_name":9225,"surname":9226,"affiliation":63,"orcid":63},"We are planning to develop argumentative dialogue systems that can discuss various topics with people by using large-scale argumentation structures. In this paper, we describe the creation process of these argumentation structures. We created ten structures each having more than 2000 nodes of five topics in English and five topics in Japanese. We analyzed the created structures for their characteristics and investigated the differences between the two languages. We conducted an evaluation experiment to ascertain that the structures can be applied to dialogue systems. 
We conducted another experiment to use the created argumentation structures as training data for augmenting the current argumentation structures.",{"paper_id":11988,"title":11989,"year":81,"month":855,"day":63,"doi":11990,"resource_url":11991,"first_page":63,"last_page":63,"pdf_url":11992,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":11993,"paper_type":860,"authors":11994,"abstract":12009},"lrec2018-main-628","Exploring Conversational Language Generation for Rich Content about Hotels","10.63317\u002F45mzm62zq2pt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-628","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F763.pdf","walker-etal-2018-exploring",[11995,11998,12001,12004,12006],{"paper_id":11988,"author_seq":247,"given_name":11996,"surname":11997,"affiliation":63,"orcid":63},"Marilyn","Walker",{"paper_id":11988,"author_seq":232,"given_name":11999,"surname":12000,"affiliation":63,"orcid":63},"Albry","Smither",{"paper_id":11988,"author_seq":218,"given_name":12002,"surname":12003,"affiliation":63,"orcid":63},"Shereen","Oraby",{"paper_id":11988,"author_seq":203,"given_name":12005,"surname":1478,"affiliation":63,"orcid":63},"Vrindavan",{"paper_id":11988,"author_seq":188,"given_name":12007,"surname":12008,"affiliation":63,"orcid":63},"Hadar","Shemtov","Dialogue systems for hotel and tourist information have typically simplified the richness of the domain, focusing system utterances on only a few selected attributes such as price,  location and type of rooms.  However, much more content is typically  available for hotels, often as many as 50 distinct instantiated   attributes for an individual entity. New methods are needed to use   this content to generate natural dialogues for hotel information, and   in general for any domain with such rich complex content.  
We   describe three experiments aimed at collecting data that can inform   an NLG for hotels dialogues, and show, not surprisingly, that the sentences in the original written hotel descriptions provided on   webpages for each hotel are stylistically not a very good match for   conversational interaction. We quantify the stylistic features that   characterize the differences between the original textual data and   the collected dialogic data. We plan to use these in stylistic   models for generation, and for scoring retrieved utterances for use   in hotel dialogues.",{"paper_id":12011,"title":12012,"year":81,"month":855,"day":63,"doi":12013,"resource_url":12014,"first_page":63,"last_page":63,"pdf_url":12015,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12016,"paper_type":860,"authors":12017,"abstract":12021},"lrec2018-main-629","Identification of Personal Information Shared in Chat-Oriented Dialogue","10.63317\u002F2nzdwduree2t","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-629","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F877.pdf","fillwock-traum-2018-identification",[12018,12020],{"paper_id":12011,"author_seq":247,"given_name":11312,"surname":12019,"affiliation":63,"orcid":63},"Fillwock",{"paper_id":12011,"author_seq":232,"given_name":1199,"surname":1200,"affiliation":63,"orcid":63},"We present an analysis of how personal information is shared in chat-oriented dialogue. We develop an annotation scheme, including entity-types, attributes, and values, that can be used to annotate the presence and type of personal information in these dialogues. A collection of attribute types is identified from the annotation of three chat-oriented dialogue corpora and a taxonomy of personal information pertinent to chat-oriented dialogue is presented. 
We examine similarities and differences in the frequency of specific attributes in the three corpora and observe that there is much overlap between the  attribute types which are shared between dialogue participants in these different settings. The work presented here suggests that there is a common set of attribute types that frequently occur within chat-oriented dialogue in general. This resource can be used in the development of chat-oriented dialogue systems by providing common topics that a dialogue system should be able to talk about.",{"paper_id":12023,"title":12024,"year":81,"month":855,"day":63,"doi":12025,"resource_url":12026,"first_page":63,"last_page":63,"pdf_url":12027,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12028,"paper_type":860,"authors":12029,"abstract":12038},"lrec2018-main-630","A Vietnamese Dialog Act Corpus Based on ISO 24617-2 standard","10.63317\u002F4vhy2zwxugyp","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-630","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F942.pdf","ngo-etal-2018-vietnamese",[12030,12033,12035],{"paper_id":12023,"author_seq":247,"given_name":12031,"surname":12032,"affiliation":63,"orcid":63},"Thi-Lan","Ngo",{"paper_id":12023,"author_seq":232,"given_name":11758,"surname":12034,"affiliation":63,"orcid":63},"Khac Linh",{"paper_id":12023,"author_seq":218,"given_name":12036,"surname":12037,"affiliation":63,"orcid":63},"Hideaki","Takeda","The voice-based human-machine interaction systems such as personal virtual assistants, chat-bots, and automatic contact centres are becoming increasingly popular. In this trend, conversation mining research also is getting the attention of many researchers. Standardized data play an important role in conversation mining. 
In this paper, we present a new Vietnamese corpus annotated for dialog acts using the ISO 24617-2 standard (2012), for emotions using Ekman's six primitives (1972), and for sentiment using the tags ``positive\", ``negative\" and ``neutral''. Emotion and sentiment are tagged at functional segment level. We show how the corpus is constructed and provide a brief statistical description of the data. This is the first Vietnamese dialog act corpus.",{"paper_id":12040,"title":12041,"year":81,"month":855,"day":63,"doi":12042,"resource_url":12043,"first_page":63,"last_page":63,"pdf_url":12044,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12045,"paper_type":860,"authors":12046,"abstract":12051},"lrec2018-main-631","Annotating Reflections for Health Behavior Change Therapy","10.63317\u002F3pt7yp3wwagh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-631","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F986.pdf","guntakandla-nielsen-2018-annotating",[12047,12050],{"paper_id":12040,"author_seq":247,"given_name":12048,"surname":12049,"affiliation":63,"orcid":63},"Nishitha","Guntakandla",{"paper_id":12040,"author_seq":232,"given_name":5345,"surname":5346,"affiliation":63,"orcid":63},"We present our work on annotating reflections, an essential counselor behavioral code in motivational interviewing for psychotherapy on  conversations  that  are  a  combination  of  casual  and  therapeutic  dialogue.   We        annotated  all        the  therapists’  utterances        from  ten transcripts spanning more than five hours of conversation with Complex Reflection, Simple Reflection, or No Reflection.  We also provide insights into corpus quality and code distributions.        The corpus that we constructed and annotated in this effort is a vital resource for automated health behavior change therapy via a dialogue system.  
As the on-going work, additional conversations are being annotated at least by one annotator.",{"paper_id":12053,"title":12054,"year":81,"month":855,"day":63,"doi":12055,"resource_url":12056,"first_page":63,"last_page":63,"pdf_url":12057,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12058,"paper_type":860,"authors":12059,"abstract":12075},"lrec2018-main-632","Annotating Attribution Relations in Arabic","10.63317\u002F2b3shwzvyczz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-632","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F83.pdf","alsaif-etal-2018-annotating",[12060,12063,12066,12069,12072],{"paper_id":12053,"author_seq":247,"given_name":12061,"surname":12062,"affiliation":63,"orcid":63},"Amal","Alsaif",{"paper_id":12053,"author_seq":232,"given_name":12064,"surname":12065,"affiliation":63,"orcid":63},"Tasniem","Alyahya",{"paper_id":12053,"author_seq":218,"given_name":12067,"surname":12068,"affiliation":63,"orcid":63},"Madawi","Alotaibi",{"paper_id":12053,"author_seq":203,"given_name":12070,"surname":12071,"affiliation":63,"orcid":63},"Huda","Almuzaini",{"paper_id":12053,"author_seq":188,"given_name":12073,"surname":12074,"affiliation":63,"orcid":63},"Abeer","Algahtani","We present a first empirical effort in annotating attribution in Modern Standard Arabic (MSA). Identifying attributed arguments to the source is applied successfully in diverse systems such as authorship identification, information retrieval, and opinion mining. Current studies focus on using lexical terms in long texts to verify, for example, the author identity. While attribution identification in short texts is still unexplored completely due to the lack of resources such as annotated corpora and tools especially in Arabic on one hand, and the limited coverage of different attribution usages in Arabic literature, on other hand. 
The paper presents our guidelines for annotating attribution elements: cue, source, and the content with required syntactical and semantic features in Arabic news (Arabic TreeBank - ATB) insight of earlier studies for other languages with all required adaptation. We also develop a new annotation tool for attribution in Arabic to ensure that all instances of attribution are reliably annotated. The results of a pilot annotation are discussed in addition to the inter-annotators agreement studies towards creating the first gold standard attribution corpus for Arabic.",{"paper_id":12077,"title":12078,"year":81,"month":855,"day":63,"doi":12079,"resource_url":12080,"first_page":63,"last_page":63,"pdf_url":12081,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12082,"paper_type":860,"authors":12083,"abstract":12105},"lrec2018-main-633","The ADELE Corpus of Dyadic Social Text Conversations:Dialog Act Annotation with ISO 24617-2","10.63317\u002F2gi8fex93b55","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-633","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F407.pdf","gilmartin-etal-2018-adele",[12084,12085,12087,12090,12092,12094,12096,12099,12102,12103],{"paper_id":12077,"author_seq":247,"given_name":6558,"surname":6559,"affiliation":63,"orcid":63},{"paper_id":12077,"author_seq":232,"given_name":904,"surname":12086,"affiliation":63,"orcid":63},"Saam",{"paper_id":12077,"author_seq":218,"given_name":12088,"surname":12089,"affiliation":63,"orcid":63},"Brendan","Spillane",{"paper_id":12077,"author_seq":203,"given_name":2301,"surname":12091,"affiliation":63,"orcid":63},"O’Reilly",{"paper_id":12077,"author_seq":188,"given_name":12093,"surname":1220,"affiliation":63,"orcid":63},"Ketong",{"paper_id":12077,"author_seq":172,"given_name":12095,"surname":5031,"affiliation":63,"orcid":63},"Arturo",{"paper_id":12077,"author_seq":155,"given_name":12097,"surname":12098,"affiliation":63,"orcid":63},"Loredana","Cerrato",{"pape
r_id":12077,"author_seq":138,"given_name":12100,"surname":12101,"affiliation":63,"orcid":63},"Killian","Levacher",{"paper_id":12077,"author_seq":121,"given_name":3201,"surname":3202,"affiliation":63,"orcid":63},{"paper_id":12077,"author_seq":104,"given_name":11237,"surname":12104,"affiliation":63,"orcid":63},"Wade","Social or interactional dialog is less well described than task-based or instrumental dialog, although there is increasing interest in the genre, particularly in light of new spoken and text dialog applications which aim to relate to the user as well as perform tasks. Dialog act annotation aids understanding of interaction structure; essential to the design of successful artificial dialog. Much social text interaction may be closer to social talk than to traditional written language. In this paper we briefly describe social or casual talk, and review how current dialog annotation schemes and particularly the ISO standard 24617-2 (Semantic annotation framework, Part 2: Dialogue Acts) treat non-task elements of dialog. To aid in training a casual talk system, we collected a corpus of 193 dyadic text dialogs, based on a novel `getting to know you' social dialog elicitation paradigm. We describe the annotation of the dialogs, and propose additional acts to better cover greeting and leavetaking. We report on preliminary analyses of the corpus, and provide an overview of the corpus content and its relationship to spoken language. 
The corpus, coding manual, and annotations are being packaged and will be made available to interested researchers.",{"paper_id":12107,"title":12108,"year":81,"month":855,"day":63,"doi":12109,"resource_url":12110,"first_page":63,"last_page":63,"pdf_url":12111,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12112,"paper_type":860,"authors":12113,"abstract":12116},"lrec2018-main-634","An Assessment of Explicit Inter- and Intra-sentential Discourse Connectives in Turkish Discourse Bank","10.63317\u002F44p4oh9vspxc","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-634","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F518.pdf","zeyrek-kurfali-2018-assessment",[12114,12115],{"paper_id":12107,"author_seq":247,"given_name":6422,"surname":6423,"affiliation":63,"orcid":63},{"paper_id":12107,"author_seq":232,"given_name":6426,"surname":6427,"affiliation":63,"orcid":63},"The paper offers a quantitative and qualitative analysis of explicit inter- and intra-sentential discourse connectives in Turkish Discourse Bank, or TDB version 1.1, a multi-genre resource of written Turkish manually annotated at the discourse level following the goals and principles of Penn Discourse TreeBank. TDB 1.1 is a 40K-word corpus involving all major discourse relation types (explicit discourse relations at intra- and inter-sentential positions, implicit discourse relations, alternative lexicalizations and entity relations) along with their senses and the text spans they relate. The paper focuses on the addition of a new set of explicit intra-sentential connectives to TDB 1.1, namely converbs (a subset of subordinators), which are suffixal connectives mostly corresponding to subordinating conjunctions in European languages. An evaluation of the converb sense annotations is provided. 
Then, with corpus statistics, explicit intra- and inter-sentential connectives are compared in terms of their frequency of occurrence and with respect to the senses they convey. The results suggest that the subordinators tend to select certain senses not selected by explicit inter-sentential discourse connectives in the data. Overall, our findings offer a promising direction for future NLP tasks in Turkish.",{"paper_id":12118,"title":12119,"year":81,"month":855,"day":63,"doi":12120,"resource_url":12121,"first_page":63,"last_page":63,"pdf_url":12122,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12123,"paper_type":860,"authors":12124,"abstract":12134},"lrec2018-main-635","Compilation of Corpora for the Study of the Information Structure–Prosody Interface","10.63317\u002F32pkapa586ap","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-635","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F530.pdf","burga-etal-2018-compilation",[12125,12127,12130,12133],{"paper_id":12118,"author_seq":247,"given_name":5486,"surname":12126,"affiliation":63,"orcid":63},"Burga",{"paper_id":12118,"author_seq":232,"given_name":12128,"surname":12129,"affiliation":63,"orcid":63},"Mónica","Domínguez",{"paper_id":12118,"author_seq":218,"given_name":12131,"surname":12132,"affiliation":63,"orcid":63},"Mireia","Farrús",{"paper_id":12118,"author_seq":203,"given_name":8099,"surname":8100,"affiliation":63,"orcid":63},"Theoretical studies on the Information Structure–prosody interface argue that the content packaged in terms of theme and rheme correlates with the intonation of the corresponding sentence. However, there are few empirical studies that support this argument and even fewer resources that promote reproducibility and scalability of experiments. In this paper, we introduce a methodology for the compilation of annotated corpora to study the correspondence between Information Structure and prosody. 
The application of this methodology is exemplified on a corpus of read speech in English annotated with hierarchical thematicity and automatically extracted prosodic parameters.",{"paper_id":12136,"title":12137,"year":81,"month":855,"day":63,"doi":12138,"resource_url":12139,"first_page":63,"last_page":63,"pdf_url":12140,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12141,"paper_type":860,"authors":12142,"abstract":12151},"lrec2018-main-636","Preliminary Analysis of Embodied Interactions between Science Communicators and Visitors Based on a Multimodal Corpus of Japanese Conversations in a Science Museum","10.63317\u002F2tqwparkcmvj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-636","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F566.pdf","sakaida-etal-2018-preliminary",[12143,12145,12148],{"paper_id":12136,"author_seq":247,"given_name":3906,"surname":12144,"affiliation":63,"orcid":63},"Sakaida",{"paper_id":12136,"author_seq":232,"given_name":12146,"surname":12147,"affiliation":63,"orcid":63},"Ryosaku","Makino",{"paper_id":12136,"author_seq":218,"given_name":12149,"surname":12150,"affiliation":63,"orcid":63},"Mayumi","Bono","This paper introduces preliminary analyses of embodied interactions, drawing on a multimodal corpus of Japanese conversations, which we video-recorded during scientific communications at a museum in Tokyo, the Miraikan. A comparison of similar cases extracted from our multimodal corpus shows both similarities and differences, not only in language use but also in bodily conduct in certain interactional sequences. We focus on a number of sequences, such as those where science communicators invite visitors to walk to the next exhibit, and our detailed analyses show that the practices of science communicators are context-free and context-sensitive interactional procedures, adapted and adjusted to the different situations communicators may encounter. 
After presenting our analyses, based on a corpus from a naturally occurring but partly controlled setting, we suggest that we can investigate both the generality and the situatedness of interactional practices. In the future, using such multimodal corpora, we will be able to both qualitatively and quantitatively analyze language use and non-verbal behaviors in situated activities.",{"paper_id":12153,"title":12154,"year":81,"month":855,"day":63,"doi":12155,"resource_url":12156,"first_page":63,"last_page":63,"pdf_url":12157,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12158,"paper_type":860,"authors":12159,"abstract":12168},"lrec2018-main-637","Improving Crowdsourcing-Based Annotation of Japanese Discourse Relations","10.63317\u002F28vbj6p6xyoq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-637","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F640.pdf","kishimoto-etal-2018-improving",[12160,12163,12165,12166,12167],{"paper_id":12153,"author_seq":247,"given_name":12161,"surname":12162,"affiliation":63,"orcid":63},"Yudai","Kishimoto",{"paper_id":12153,"author_seq":232,"given_name":2169,"surname":12164,"affiliation":63,"orcid":63},"Sawada",{"paper_id":12153,"author_seq":218,"given_name":6160,"surname":6161,"affiliation":63,"orcid":63},{"paper_id":12153,"author_seq":203,"given_name":1879,"surname":1880,"affiliation":63,"orcid":63},{"paper_id":12153,"author_seq":188,"given_name":1882,"surname":1883,"affiliation":63,"orcid":63},"Although discourse parsing is an important and fundamental task in natural language processing, few languages have corpora annotated with discourse relations and if any, they are small in size. Creating a new corpus of discourse relations by hand is costly and time-consuming. To cope with this problem, Kawahara et al. (2014) constructed a Japanese corpus with discourse annotations through crowdsourcing. However, they did not evaluate the quality of the annotation. 
In this paper, we evaluate the quality of the annotation using expert annotations. We find out that crowdsourcing-based annotation still leaves much room for improvement. Based on the error analysis, we propose improvement techniques based on language tests. We re-annotated the corpus with discourse annotations using the improvement techniques, and achieved approximately 3% improvement in F-measure. We will make re-annotated data publicly available.",{"paper_id":12170,"title":12171,"year":81,"month":855,"day":63,"doi":12172,"resource_url":12173,"first_page":63,"last_page":63,"pdf_url":12174,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12175,"paper_type":860,"authors":12176,"abstract":12183},"lrec2018-main-638","Persian Discourse Treebank and coreference corpus","10.63317\u002F4dnao5cnticu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-638","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F673.pdf","mirzaei-safari-2018-persian",[12177,12180],{"paper_id":12170,"author_seq":247,"given_name":12178,"surname":12179,"affiliation":63,"orcid":63},"Azadeh","Mirzaei",{"paper_id":12170,"author_seq":232,"given_name":12181,"surname":12182,"affiliation":63,"orcid":63},"Pegah","Safari","This research addresses the investigation of intra-document relations based on two major approaches: discourse analysis and coreference resolution which results in building the first Persian discourse Treebank and a comprehensive Persian coreference corpus. In discourse analysis, we have explored sentence-level relations defined between clauses in complex sentences. So we specified 34682 discourse relations, the sense of the relations, their arguments and their attributes mainly consisted of the source of the message and its type. Our discourse analysis is based on a corpus consisted of 30000 individual sentences with morphologic, syntactic and semantic labels and nearly half a million tokens. 
Also 18336 of these sentences are double-annotated. For coreference annotation, since a document-based corpus was needed, we prepared a new corpus consisted of 547 documents and 212646 tokens which is still under development. We enriched it with morphological and syntactical labels and added coreference information at the top. Currently, we have annotated 6511 coreference chains and 21303 mentions with a comprehensive annotation scheme to compensate some specification of Persian such as being pro-drop or lacking gender agreement information.",{"paper_id":12185,"title":12186,"year":81,"month":855,"day":63,"doi":12187,"resource_url":12188,"first_page":63,"last_page":63,"pdf_url":12189,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12190,"paper_type":860,"authors":12191,"abstract":12199},"lrec2018-main-639","Automatic Labeling of Problem-Solving Dialogues for Computational Microgenetic Learning Analytics","10.63317\u002F47id9fcwftu8","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-639","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F997.pdf","meng-etal-2018-automatic",[12192,12194,12196],{"paper_id":12185,"author_seq":247,"given_name":12193,"surname":2720,"affiliation":63,"orcid":63},"Yuanliang",{"paper_id":12185,"author_seq":232,"given_name":884,"surname":12195,"affiliation":63,"orcid":63},"Rumshisky",{"paper_id":12185,"author_seq":218,"given_name":12197,"surname":12198,"affiliation":63,"orcid":63},"Florence","Sullivan","This paper presents a recurrent neural network model to automate the analysis of students' computational thinking in problem-solving dialogue.  We have collected and annotated dialogue transcripts from middle school students solving a robotics challenge, and each dialogue turn is assigned a code. We use sentence embeddings and speaker identities as features, and experiment with linear chain CRFs and RNNs with a CRF layer (LSTM-CRF). 
Both the linear chain CRF model and the LSTM-CRF model outperform the naive baselines by a large margin, and LSTM-CRF has an edge between the two. To our knowledge, this is the first study on dialogue segment annotation using neural network models. This study is also a stepping-stone to automating the microgenetic analysis of cognitive interactions between students.",{"paper_id":12201,"title":12202,"year":81,"month":855,"day":63,"doi":12203,"resource_url":12204,"first_page":63,"last_page":63,"pdf_url":12205,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12206,"paper_type":860,"authors":12207,"abstract":12220},"lrec2018-main-640","Increasing Argument Annotation Reproducibility by Using Inter-annotator Agreement to Improve Guidelines","10.63317\u002F5fpa74ypio28","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-640","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1048.pdf","teruel-etal-2018-increasing",[12208,12211,12214,12215,12217],{"paper_id":12201,"author_seq":247,"given_name":12209,"surname":12210,"affiliation":63,"orcid":63},"Milagro","Teruel",{"paper_id":12201,"author_seq":232,"given_name":12212,"surname":12213,"affiliation":63,"orcid":63},"Cristian","Cardellino",{"paper_id":12201,"author_seq":218,"given_name":8199,"surname":12213,"affiliation":63,"orcid":63},{"paper_id":12201,"author_seq":203,"given_name":1172,"surname":12216,"affiliation":63,"orcid":63},"Alonso Alemany",{"paper_id":12201,"author_seq":188,"given_name":12218,"surname":12219,"affiliation":63,"orcid":63},"Serena","Villata","In this abstract we present a methodology to improve Argument annotation guidelines by exploiting inter-annotator agreement measures. After a first stage of the annotation effort, we have detected problematic issues via an analysis of inter-annotator agreement. We have detected ill-defined concepts, which we have addressed by redefining high-level annotation goals. 
For other concepts, that are well-delimited but complex, the annotation protocol has been extended and detailed.  Moreover, as can be expected, we show that distinctions where human annotators have less agreement are also those where automatic analyzers perform worse. Thus, the reproducibility of results of Argument Mining systems can be addressed by improving inter-annotator agreement in the training material.  Following this methodology, we are enhancing a corpus annotated with argumentation, available at https:\u002F\u002Fgithub.com\u002FPLN-FaMAF\u002FArgumentMiningECHR together with guidelines and analyses of agreement. These analyses can be used to filter performance figures of automated systems, with lower penalties for cases where human annotators agree less.",{"paper_id":12222,"title":12223,"year":81,"month":855,"day":63,"doi":12224,"resource_url":12225,"first_page":63,"last_page":63,"pdf_url":12226,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12227,"paper_type":860,"authors":12228,"abstract":12231},"lrec2018-main-641","Semi-Supervised Clustering for Short Answer Scoring","10.63317\u002F45z7js2bo39m","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-641","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F427.pdf","horbach-pinkal-2018-semi",[12229,12230],{"paper_id":12222,"author_seq":247,"given_name":2516,"surname":7471,"affiliation":63,"orcid":63},{"paper_id":12222,"author_seq":232,"given_name":1058,"surname":1059,"affiliation":63,"orcid":63},"This paper investigates the use of semi-supervised clustering for Short Answer Scoring (SAS). In SAS, clustering techniques are an attractive alternative to classification because they provide structured groups of answers in addition to a score.   Previous approaches use unsupervised clustering and have teachers label some items after clustering. 
We propose to re-allocate some of the human annotation effort to before and during the clustering process for (i) feature selection, (ii) for creating pairwise constraints and (iii) for metric learning. Our methods improve clustering performance substantially from 0.504 kappa for unsupervised clustering to 0.566.",{"paper_id":12233,"title":12234,"year":81,"month":855,"day":63,"doi":12235,"resource_url":12236,"first_page":63,"last_page":63,"pdf_url":12237,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12238,"paper_type":860,"authors":12239,"abstract":12244},"lrec2018-main-642","Analyzing Vocabulary Commonality Index Using Large-scaled Database of Child Language Development","10.63317\u002F5ea8ctvo7zg9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-642","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F623.pdf","cao-etal-2018-analyzing",[12240,12241,12242,12243],{"paper_id":12233,"author_seq":247,"given_name":9207,"surname":7336,"affiliation":63,"orcid":63},{"paper_id":12233,"author_seq":232,"given_name":7427,"surname":7428,"affiliation":63,"orcid":63},{"paper_id":12233,"author_seq":218,"given_name":7432,"surname":7433,"affiliation":63,"orcid":63},{"paper_id":12233,"author_seq":203,"given_name":7430,"surname":3802,"affiliation":63,"orcid":63},"The present study proposed a vocabulary commonality index for child language development to investigate to what extent each child acquires common words during the early stages of lexical development. We used large-scaled, vocabulary-checklist data from Japanese-speaking children (N=1,451) aged 8-48 months to estimate their age of acquisition (AoA) of 2688 words by logistic regression. Then we calculated the vocabulary commonality index for each child with two datasets. 
The results showed that as their vocabulary size increases, children who have the same vocabulary size tend to produce common words with the same ratio.",{"paper_id":12246,"title":12247,"year":81,"month":855,"day":63,"doi":12248,"resource_url":12249,"first_page":63,"last_page":63,"pdf_url":12250,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12251,"paper_type":860,"authors":12252,"abstract":12259},"lrec2018-main-643","The ICoN Corpus of Academic Written Italian (L1 and L2)","10.63317\u002F55qcgrq3jy8b","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-643","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F823.pdf","tavosanis-cominetti-2018-icon",[12253,12256],{"paper_id":12246,"author_seq":247,"given_name":12254,"surname":12255,"affiliation":63,"orcid":63},"Mirko","Tavosanis",{"paper_id":12246,"author_seq":232,"given_name":12257,"surname":12258,"affiliation":63,"orcid":63},"Federica","Cominetti","This paper describes the ICoN corpus, a corpus of academic written Italian, some of the directions of research it could open, and some of the first outcomes of research conducted on it. The ICoN corpus includes 2,115,000 tokens written by students having Italian as L2 students (level B2 or higher) and 1,769,000 tokens written by students having Italian as L1; this makes it the largest corpus of its kind. The texts included in the corpus come from the online examinations taken by 787 different students for the ICoN Degree Program in Italian Language and Culture for foreign students and Italian citizens residing abroad. The texts were produced by students having 41 different L1s, and 18 different L1s are represented in the corpus by more than 20,000 tokens. The corpus is encoded in XML files; it can be freely queried online and it is available upon request for research purposes. 
The paper includes the discussion of preliminary research in the field of collocations, showing that, in the texts included in the corpus, while learners and natives do use multiword expressions in a similar way, learners can overuse relatively infrequent forms of multiword adverbials, or use some adverbials in a non-standard way.",{"paper_id":12261,"title":12262,"year":81,"month":855,"day":63,"doi":12263,"resource_url":12264,"first_page":63,"last_page":63,"pdf_url":12265,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12266,"paper_type":860,"authors":12267,"abstract":12276},"lrec2018-main-644","Revita: a Language-learning Platform at the Intersection of ITS and CALL","10.63317\u002F5f2yjftgogkj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-644","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F841.pdf","katinskaia-etal-2018-revita",[12268,12271,12274],{"paper_id":12261,"author_seq":247,"given_name":12269,"surname":12270,"affiliation":63,"orcid":63},"Anisia","Katinskaia",{"paper_id":12261,"author_seq":232,"given_name":12272,"surname":12273,"affiliation":63,"orcid":63},"Javad","Nouri",{"paper_id":12261,"author_seq":218,"given_name":2083,"surname":12275,"affiliation":63,"orcid":63},"Yangarber","This paper presents Revita, a Web-based platform for language learning—beyond the beginner level. We anchor the presentation in a survey, where we review the literature about recent advances in the fields of computer-aided language learning (CALL) and intelligent tutoring systems (ITS). We outline the established desiderata of CALL and ITS and discuss how Revita addresses (the majority of) the theoretical requirements of CALL and ITS. 
Finally, we claim that, to the best of our knowledge, Revita is currently the only platform for learning\u002Ftutoring beyond the beginner level, that is functional, freely-available and supports multiple languages.",{"paper_id":12278,"title":12279,"year":81,"month":855,"day":63,"doi":12280,"resource_url":12281,"first_page":63,"last_page":63,"pdf_url":12282,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12283,"paper_type":860,"authors":12284,"abstract":12289},"lrec2018-main-645","The Distribution and Prosodic Realization of Verb Forms in German Infant-Directed Speech","10.63317\u002F2oscxptrt89n","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-645","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F845.pdf","braun-zahner-2018-distribution",[12285,12287],{"paper_id":12278,"author_seq":247,"given_name":7844,"surname":12286,"affiliation":63,"orcid":63},"Braun",{"paper_id":12278,"author_seq":232,"given_name":8608,"surname":12288,"affiliation":63,"orcid":63},"Zahner","Infant-directed speech is often seen as a predictor for infants' speech processing abilities, for instance speech segmentation or word learning. In this paper, we examine the syntactic distribution (position), accentuation and prosodic phrasing of German verb forms and discuss that many verb forms are prime candidates for early segmentation: they frequently appear at the start or end of prosodic phrases; if they are not phrase-initial, they are often preceded by closed-class word forms and they are frequently accented (imperative verb forms: 72% of the cases, infinitive verb forms: 82% of the cases). 
It thus appears that German infants ought to be able to extract verbs as early as nouns, given appropriate stimulus materials.",{"paper_id":12291,"title":12292,"year":81,"month":855,"day":63,"doi":12293,"resource_url":12294,"first_page":63,"last_page":63,"pdf_url":12295,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12296,"paper_type":860,"authors":12297,"abstract":12305},"lrec2018-main-646","Cross-linguistically Small World Networks are Ubiquitous in Child-directed Speech","10.63317\u002F4zy5yfaceff7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-646","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F887.pdf","moran-etal-2018-cross",[12298,12299,12302],{"paper_id":12291,"author_seq":247,"given_name":1085,"surname":3346,"affiliation":63,"orcid":63},{"paper_id":12291,"author_seq":232,"given_name":12300,"surname":12301,"affiliation":63,"orcid":63},"Danica","Pajović",{"paper_id":12291,"author_seq":218,"given_name":12303,"surname":12304,"affiliation":63,"orcid":63},"Sabine","Stoll","In this paper we use network theory to model graphs of child-directed speech from caregivers of children from nine typologically and morphologically diverse languages. With the resulting lexical adjacency graphs, we calculate the network statistics {N, E, \u003Ck>, L, C} and compare them against the standard baseline of the same parameters from randomly generated networks of the same size. We show that typologically and morphologically diverse languages all share small world properties in their child-directed speech. Our results add to the repertoire of universal distributional patterns found in the input to children cross-linguistically. 
We discuss briefly some implications for language acquisition research.",{"paper_id":12307,"title":12308,"year":81,"month":855,"day":63,"doi":12309,"resource_url":12310,"first_page":63,"last_page":63,"pdf_url":12311,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12312,"paper_type":860,"authors":12313,"abstract":12317},"lrec2018-main-647","L1-L2 Parallel Treebank of Learner Chinese: Overused and Underused Syntactic Structures","10.63317\u002F4mxrwpu62cwf","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-647","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F901.pdf","li-lee-2018-l1",[12314,12316],{"paper_id":12307,"author_seq":247,"given_name":12315,"surname":1591,"affiliation":63,"orcid":63},"Keying",{"paper_id":12307,"author_seq":232,"given_name":1734,"surname":4032,"affiliation":63,"orcid":63},"We present a preliminary analysis on a corpus of texts written by learners of Chinese as a foreign language (CFL), annotated in the form of an L1-L2 parallel dependency treebank.  The treebank consists of parse trees of sentences written by CFL learners (“L2 sentences”), parse trees of their target hypotheses (“L1 sentences”), and word alignment between the L1 sentences and L2 sentences.  Currently, the treebank consists of 600 L2 sentences and 697 L1 sentences.  
We report the most overused and underused syntactic relations by the CFL learners, and discuss the underlying learner errors.",{"paper_id":12319,"title":12320,"year":81,"month":855,"day":63,"doi":12321,"resource_url":12322,"first_page":63,"last_page":63,"pdf_url":12323,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12324,"paper_type":860,"authors":12325,"abstract":12334},"lrec2018-main-648","The Use of Text Alignment in Semi-Automatic Error Analysis: Use Case in the Development of the Corpus of the Latvian Language Learners","10.63317\u002F32pow6apdbqv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-648","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F933.pdf","dargis-etal-2018-use",[12326,12328,12331],{"paper_id":12319,"author_seq":247,"given_name":1701,"surname":12327,"affiliation":63,"orcid":63},"Darģis",{"paper_id":12319,"author_seq":232,"given_name":12329,"surname":12330,"affiliation":63,"orcid":63},"Ilze","Auziņa",{"paper_id":12319,"author_seq":218,"given_name":12332,"surname":12333,"affiliation":63,"orcid":63},"Kristīne","Levāne-Petrova","This article presents a different method for creation of error annotated corpora. The approach suggested in this paper consists of multiple parts - text correction, automated morphological analysis, automated text alignment and error annotation. Error annotation can easily be semi-automated with a rule-based system, similar to the one used in this paper. The text correction can also be semi-automated using a rule-based system or even machine learning. The use of the text correction, word, and letter alignment enables more in-depth analysis of errors types, providing opportunities for quantitative research. The proposed method has been approbated in the development of the corpus of the Latvian language learners. Spelling, punctuation, grammatical, syntactic and lexical errors are annotated in the corpus. 
Text that is not understandable is marked as unclear for additional analysis.  The method can easily be adapted for the development of error corpora in any other languages with relatively free word order. The highest gain from this method will be for highly inflected languages with rich morphology.",{"paper_id":12336,"title":12337,"year":81,"month":855,"day":63,"doi":12338,"resource_url":12339,"first_page":63,"last_page":63,"pdf_url":12340,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12341,"paper_type":860,"authors":12342,"abstract":12347},"lrec2018-main-649","Error annotation in a Learner Corpus of Portuguese","10.63317\u002F4hr8whsjnsw7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-649","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1002.pdf","del-rio-mendes-2018-error",[12343,12346],{"paper_id":12336,"author_seq":247,"given_name":12344,"surname":12345,"affiliation":63,"orcid":63},"Iria","del Río",{"paper_id":12336,"author_seq":232,"given_name":3893,"surname":3894,"affiliation":63,"orcid":63},"We present the error tagging system of the COPLE2 corpus and the first results of its implementation. The system takes advantage of the corpus architecture and the possibilities of the TEITOK environment to reduce manual effort and produce a final standoff, multi-level annotation with position-based tags that account for the main error types observed in the corpus. The first step of the tagging process involves the manual annotation of errors at the token level. We have already annotated 47% of the corpus using this approach. In a further step, the token-based annotations will be automatically transformed (fully or partially) into position-based error tags. COPLE2 is the first Portuguese learner corpus with error annotation. 
We expect that this work will support new research in different fields connected with Portuguese as second\u002Fforeign language, like Second Language Acquisition\u002FTeaching or Computer Assisted Learning.",{"paper_id":12349,"title":12350,"year":81,"month":855,"day":63,"doi":12351,"resource_url":12352,"first_page":63,"last_page":63,"pdf_url":12353,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12354,"paper_type":860,"authors":12355,"abstract":12359},"lrec2018-main-650","An SLA Corpus Annotated with Pedagogically Relevant Grammatical Structures","10.63317\u002F2us9424hmn4g","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-650","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1050.pdf","zilio-etal-2018-sla",[12356,12357,12358],{"paper_id":12349,"author_seq":247,"given_name":1956,"surname":1957,"affiliation":63,"orcid":63},{"paper_id":12349,"author_seq":232,"given_name":1953,"surname":1954,"affiliation":63,"orcid":63},{"paper_id":12349,"author_seq":218,"given_name":1959,"surname":1960,"affiliation":63,"orcid":63},"The evaluation of a language learner's proficiency in second language is a task that normally involves comparing the learner's production with a learning framework of the target language. A broad framework is the Common European Framework for Languages (CEFR), which addresses language learning in general and is broadly used in the European Union, while serving as reference in countries outside the EU as well. In this study, we automatically annotated a corpus of texts produced by language learners with pedagogically relevant grammatical structures and we observed how these structures are being employed by learners from different proficiency levels. We analyzed the use of structures both in terms of evolution along the levels and in terms  of level in which the structures are used the most. 
The annotated resource, SGATe, presents a rich source of information for teachers that wish to compare the production of their students with those of already certified language learners.",{"paper_id":12361,"title":12362,"year":81,"month":855,"day":63,"doi":12363,"resource_url":12364,"first_page":63,"last_page":63,"pdf_url":12365,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12366,"paper_type":860,"authors":12367,"abstract":12374},"lrec2018-main-651","Portable Spelling Corrector for a Less-Resourced Language: Amharic","10.63317\u002F5i63t7yj84au","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-651","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F135.pdf","gezmu-etal-2018-portable",[12368,12371,12373],{"paper_id":12361,"author_seq":247,"given_name":12369,"surname":12370,"affiliation":63,"orcid":63},"Andargachew Mekonnen","Gezmu",{"paper_id":12361,"author_seq":232,"given_name":3238,"surname":12372,"affiliation":63,"orcid":63},"Nürnberger",{"paper_id":12361,"author_seq":218,"given_name":7217,"surname":7218,"affiliation":63,"orcid":63},"This paper describes an automatic spelling corrector for Amharic, the working language of the Federal Government of Ethiopia. We used a corpus-driven approach with the noisy channel for spelling correction. It infers linguistic knowledge from a text corpus. The approach can be ported to other written languages with little effort as long as they are typed using a QWERTY keyboard with direct mappings between keystrokes and characters. Since Amharic letters are syllabic, we used a modified version of the System for Ethiopic Representation in ASCII for transliteration in the like manner as most Amharic keyboard input methods do. The proposed approach is evaluated with Amharic and English test data and has scored better performance result than the baseline systems: GNU Aspell and Hunspell. 
We get better result due to the smoothed language model, the generalized error model and the ability to take into account the context of misspellings. Besides, instead of using a handcrafted lexicon for spelling error detection, we used a term list derived from frequently occurring terms in a text corpus. Such a term list, in addition to ease of compilation, has also an advantage in handling rare terms, proper nouns, and neologisms.",{"paper_id":12376,"title":12377,"year":81,"month":855,"day":63,"doi":12378,"resource_url":12379,"first_page":63,"last_page":63,"pdf_url":12380,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12381,"paper_type":860,"authors":12382,"abstract":12388},"lrec2018-main-652","A Speaking Atlas of the Regional Languages of France","10.63317\u002F3gjjr2k8atow","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-652","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F221.pdf","boula-de-mareuil-etal-2018-speaking",[12383,12384,12386],{"paper_id":12376,"author_seq":247,"given_name":2089,"surname":10257,"affiliation":63,"orcid":63},{"paper_id":12376,"author_seq":232,"given_name":1685,"surname":12385,"affiliation":63,"orcid":63},"Rilliard",{"paper_id":12376,"author_seq":218,"given_name":1136,"surname":12387,"affiliation":63,"orcid":63},"Vernier","The aim is to show and promote the linguistic diversity of France, through field recordings, a computer program (which allows us to visualise dialectal areas) and an orthographic transcription (which represents an object of research in itself). A website is presented (https:\u002F\u002Fatlas.limsi.fr), displaying an interactive map of France from which Aesop’s fable “The North Wind and the Sun” can be listened to and read in French and in 140 varieties of regional languages. 
There is thus both a scientific dimension and a heritage dimension in this work, insofar as a number of regional or minority languages are in a critical situation.",{"paper_id":12390,"title":12391,"year":81,"month":855,"day":63,"doi":12392,"resource_url":12393,"first_page":63,"last_page":63,"pdf_url":12394,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12395,"paper_type":860,"authors":12396,"abstract":12404},"lrec2018-main-653","Towards Language Technology for Mi’kmaq","10.63317\u002F5buswizdi5ix","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-653","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F372.pdf","maheshwari-etal-2018-towards",[12397,12399,12402],{"paper_id":12390,"author_seq":247,"given_name":12398,"surname":5054,"affiliation":63,"orcid":63},"Anant",{"paper_id":12390,"author_seq":232,"given_name":12400,"surname":12401,"affiliation":63,"orcid":63},"Léo","Bouscarrat",{"paper_id":12390,"author_seq":218,"given_name":3690,"surname":12403,"affiliation":63,"orcid":63},"Cook","Mi'kmaq is a polysynthetic Indigenous language spoken primarily in Eastern Canada, on which no prior computational work has focused. In this paper we first construct and analyze a web corpus of Mi'kmaq. We then evaluate several approaches to language modelling for Mi'kmaq, including character-level models that are particularly well-suited to morphologically-rich languages. 
Preservation of Indigenous languages is particularly important in the current Canadian context; we argue that natural language processing could aid such efforts.",{"paper_id":12406,"title":12407,"year":81,"month":855,"day":63,"doi":12408,"resource_url":12409,"first_page":63,"last_page":63,"pdf_url":12410,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12411,"paper_type":860,"authors":12412,"abstract":12415},"lrec2018-main-654","Pronunciation Dictionaries for the Alsatian Dialects to Analyze Spelling and Phonetic Variation","10.63317\u002F27s5j77aa9tk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-654","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F403.pdf","steible-bernhard-2018-pronunciation",[12413,12414],{"paper_id":12406,"author_seq":247,"given_name":6912,"surname":11828,"affiliation":63,"orcid":63},{"paper_id":12406,"author_seq":232,"given_name":1130,"surname":8064,"affiliation":63,"orcid":63},"This article presents new pronunciation dictionaries for the under-resourced Alsatian dialects, spoken in north-eastern France. These dictionaries are compared with existing phonetic transcriptions of Alsatian, German and French in order to analyze the relationship between speech and writing. The Alsatian dialects do not have a standardized spelling system, despite a literary history that goes back to the beginning of the 19th century. As a consequence, writers often use their own spelling systems, more or less based on German and often with some specifically French characters. But none of these systems can be seen as fully canonical. In this paper, we present the findings of an analysis of the spelling systems used in four different Alsatian datasets, including three newly transcribed lexicons, and describe how they differ by taking the phonetic transcriptions into account. 
We also detail experiments with a grapheme-to-phoneme (G2P) system trained on manually transcribed data and show that the combination of both spelling and phonetic variation presents specific challenges.",{"paper_id":12417,"title":12418,"year":81,"month":855,"day":63,"doi":12419,"resource_url":12420,"first_page":63,"last_page":63,"pdf_url":12421,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12422,"paper_type":860,"authors":12423,"abstract":12433},"lrec2018-main-655","ChAnot: An Intelligent Annotation Tool for Indigenous and Highly Agglutinative Languages in Peru","10.63317\u002F35zyicw3n26c","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-655","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F463.pdf","mercado-gonzales-etal-2018-chanot",[12424,12427,12429,12431],{"paper_id":12417,"author_seq":247,"given_name":12425,"surname":12426,"affiliation":63,"orcid":63},"Rodolfo","Mercado-Gonzales",{"paper_id":12417,"author_seq":232,"given_name":6490,"surname":12428,"affiliation":63,"orcid":63},"Pereira-Noriega",{"paper_id":12417,"author_seq":218,"given_name":922,"surname":12430,"affiliation":63,"orcid":63},"Sobrevilla",{"paper_id":12417,"author_seq":203,"given_name":12095,"surname":12432,"affiliation":63,"orcid":63},"Oncevay","Linguistic corpus annotation is one of the most important phases for addressing Natural Language Processing (NLP) tasks, as these methods are deeply involved with corpus-based techniques. However, meta-data annotation is a highly laborious manual task. A supportive alternative requires the use of computational tools. They are likely to simplify some of these operations, while can be adjusted appropriately to the needs of particular language features at the same time. Therefore, this paper presents ChAnot, a web-based annotation tool developed for Peruvian indigenous and highly agglutinative languages, where Shipibo-Konibo was the case study. 
This new tool is able to support a diverse set of linguistic annotation tasks, such as morphological segmentation markup, POS-tag markup, among others. Also, it includes a suggestion engine based on historic and machine learning models, and a set of statistics about previous annotations.",{"paper_id":12435,"title":12436,"year":81,"month":855,"day":63,"doi":12437,"resource_url":12438,"first_page":63,"last_page":63,"pdf_url":12439,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12440,"paper_type":860,"authors":12441,"abstract":12450},"lrec2018-main-656","The DLDP Survey on Digital Use and Usability of EU Regional and Minority Languages","10.63317\u002F3pwnvoyu9ofo","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-656","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F684.pdf","soria-etal-2018-dldp",[12442,12444,12447],{"paper_id":12435,"author_seq":247,"given_name":10215,"surname":12443,"affiliation":63,"orcid":63},"Soria",{"paper_id":12435,"author_seq":232,"given_name":12445,"surname":12446,"affiliation":63,"orcid":63},"Valeria","Quochi",{"paper_id":12435,"author_seq":218,"given_name":12448,"surname":12449,"affiliation":63,"orcid":63},"Irene","Russo","The  digital development of regional and minority languages requires careful planning to be effective and should be preceded by the identification of the current and actual extent to which those languages are used digitally, the type and frequency of their digital use, the opportunity for their use, and the main obstacles currently preventing it. This paper reports about the design, the results and the key findings of an exploratory survey launched by the Digital Language Diversity Project about the digital use and usability of regional and minority languages on digital media and devices. 
The aim of the survey - the first of this kind - was to investigate the real usage, needs and expectations of European minority language speakers regarding digital opportunities, with a strong focus on electronic communication. The survey is restricted to four languages (Basque, Breton, Karelian and Sardinian) at different stages of digital development, which offers a starting point to develop strategies for assessing digital vitality of these languages and overcoming specific difficulties such as, for instance, the lack of official data.",{"paper_id":12452,"title":12453,"year":81,"month":855,"day":63,"doi":12454,"resource_url":12455,"first_page":63,"last_page":63,"pdf_url":12456,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12457,"paper_type":860,"authors":12458,"abstract":12464},"lrec2018-main-657","ASR for Documenting Acutely Under-Resourced Indigenous Languages","10.63317\u002F3w22zkanvb4z","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-657","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F749.pdf","jimerson-prudhommeaux-2018-asr",[12459,12462],{"paper_id":12452,"author_seq":247,"given_name":12460,"surname":12461,"affiliation":63,"orcid":63},"Robbie","Jimerson",{"paper_id":12452,"author_seq":232,"given_name":8360,"surname":12463,"affiliation":63,"orcid":63},"Prud’hommeaux","Despite its potential utility for facilitating the transcription of speech recordings, automatic speech recognition (ASR) has not been widely explored as a tool for documenting endangered languages. One obstacle to adopting ASR for this purpose is that the amount of data needed to build a reliable ASR system far exceeds what would typically be available in an endangered language. Languages with highly complex morphology present further data sparsity challenges. 
In this paper, we present a working ASR system for Seneca, an endangered indigenous language of North America, as a case study for the development of ASR for acutely low-resource languages in need of linguistic documentation. We explore methods of leveraging linguistic knowledge to improve the ASR language models for a polysynthetic language with few high-quality audio and text resources, and we propose a tool for using ASR output to bootstrap new data to iteratively improve the acoustic model. This work serves as a proof-of-concept for speech researchers interested in helping field linguists and indigenous language community members engaged in the documentation and revitalization of endangered languages.",{"paper_id":12466,"title":12467,"year":81,"month":855,"day":63,"doi":12468,"resource_url":12469,"first_page":63,"last_page":63,"pdf_url":12470,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12471,"paper_type":860,"authors":12472,"abstract":12479},"lrec2018-main-658","Building a Sentiment Corpus of Tweets in Brazilian Portuguese","10.63317\u002F46j4f5notz45","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-658","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F389.pdf","brum-volpe-nunes-2018-building",[12473,12476],{"paper_id":12466,"author_seq":247,"given_name":12474,"surname":12475,"affiliation":63,"orcid":63},"Henrico","Brum",{"paper_id":12466,"author_seq":232,"given_name":12477,"surname":12478,"affiliation":63,"orcid":63},"Maria das Graças","Volpe Nunes","The large amount of data available in social media, forums and websites motivates research in several areas of Natural Language Processing, such as sentiment analysis. The popularity of the area due to its subjective and semantic characteristics motivates research on novel methods and approaches for classification. Hence, there is a high demand for datasets on different domains and different languages. 
This paper introduces TweetSentBR, a sentiment corpus for Brazilian Portuguese manually annotated with 15.000 sentences on TV show domain. The sentences were labeled in three classes (positive, neutral and negative) by seven annotators, following literature guidelines for ensuring reliability on the annotation. We also ran baseline experiments on polarity classification using six machine learning classifiers, reaching 80.38% on F-Measure in binary classification and 64.87% when including the neutral class. We also performed experiments in similar datasets for polarity classification task in comparison to this corpus.",{"paper_id":12481,"title":12482,"year":81,"month":855,"day":63,"doi":12483,"resource_url":12484,"first_page":63,"last_page":63,"pdf_url":12485,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12486,"paper_type":860,"authors":12487,"abstract":12494},"lrec2018-main-659","‘Aye’ or ‘No’? Speech-level Sentiment Analysis of Hansard UK Parliamentary Debate Transcripts","10.63317\u002F37krdg6qf9fs","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-659","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F741.pdf","abercrombie-batista-navarro-2018-aye",[12488,12491],{"paper_id":12481,"author_seq":247,"given_name":12489,"surname":12490,"affiliation":63,"orcid":63},"Gavin","Abercrombie",{"paper_id":12481,"author_seq":232,"given_name":12492,"surname":12493,"affiliation":63,"orcid":63},"Riza","Batista-Navarro","Transcripts of UK parliamentary debates provide access to the opinions of politicians towards important topics, but due to the large quantity of textual data and the specialised language used, they are not straightforward for humans to process. We apply opinion mining methods to these transcripts to classify the sentiment polarity of speakers as being either positive or negative towards the motions proposed in the debates. 
We compare classification performance on a novel corpus using both manually annotated sentiment labels and labels derived from the speakers' votes (`aye' or `no'). We introduce a two-step classification model, and evaluate the performance of both one- and two-step models, and the use of a range of textual and contextual features.  Results suggest that textual features are more indicative of manually annotated class labels. Contextual metadata features, however, boost performance and are particularly indicative of vote labels. Use of the two-step debate model results in performance gains and appears to capture some of the complexity of the debate format. Optimum performance on this data is achieved using all features to train a multi-layer neural network, indicating that such models may be most able to exploit the relationships between textual and contextual cues in parliamentary debate speeches.",{"paper_id":12496,"title":12497,"year":81,"month":855,"day":63,"doi":12498,"resource_url":12499,"first_page":63,"last_page":63,"pdf_url":12500,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12501,"paper_type":860,"authors":12502,"abstract":12509},"lrec2018-main-660","Scalable Visualisation of Sentiment and Stance","10.63317\u002F5aktrdysie8r","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-660","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F760.pdf","chamberlain-etal-2018-scalable",[12503,12505,12506],{"paper_id":12496,"author_seq":247,"given_name":1785,"surname":12504,"affiliation":63,"orcid":63},"Chamberlain",{"paper_id":12496,"author_seq":232,"given_name":1451,"surname":8156,"affiliation":63,"orcid":63},{"paper_id":12496,"author_seq":218,"given_name":12507,"surname":12508,"affiliation":63,"orcid":63},"Orland","Hoeber","Natural language processing systems have the ability to analyse not only the sentiment of human language, but also the stance of the speaker. 
Representing this information visually from unevenly distributed and potentially sparse datasets is challenging, in particular when trying to facilitate exploration and knowledge discovery. We present work on a novel visualisation approach for scalable visualisation of sentiment and stance and provide a language resource of e-government public engagement of 9,278 user comments with stance explicitly declared by the author.",{"paper_id":12511,"title":12512,"year":81,"month":855,"day":63,"doi":12513,"resource_url":12514,"first_page":63,"last_page":63,"pdf_url":12515,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12516,"paper_type":860,"authors":12517,"abstract":12534},"lrec2018-main-661","NoReC: The Norwegian Review Corpus","10.63317\u002F2mfhrz244a29","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-661","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F851.pdf","velldal-etal-2018-norec",[12518,12521,12522,12525,12528,12531],{"paper_id":12511,"author_seq":247,"given_name":12519,"surname":12520,"affiliation":63,"orcid":63},"Erik","Velldal",{"paper_id":12511,"author_seq":232,"given_name":5081,"surname":5082,"affiliation":63,"orcid":63},{"paper_id":12511,"author_seq":218,"given_name":12523,"surname":12524,"affiliation":63,"orcid":63},"Eivind Alexander","Bergem",{"paper_id":12511,"author_seq":203,"given_name":12526,"surname":12527,"affiliation":63,"orcid":63},"Cathrine","Stadsnes",{"paper_id":12511,"author_seq":188,"given_name":12529,"surname":12530,"affiliation":63,"orcid":63},"Samia","Touileb",{"paper_id":12511,"author_seq":172,"given_name":12532,"surname":12533,"affiliation":63,"orcid":63},"Fredrik","Jørgensen","This paper presents the Norwegian Review Corpus (NoReC), created for training and evaluating models for document-level sentiment analysis. 
The full-text reviews have been collected from major Norwegian news sources and cover a range of different domains, including literature, movies, video games, restaurants, music and theater, in addition to product reviews across a range of categories.  Each review is labeled with a manually assigned score of 1--6, as provided by the rating of the original author.  This first release of the corpus comprises more than 35,000 reviews. It is distributed using the CoNLL-U format, pre-processed using UDPipe, along with a rich set of metadata. The work reported in this paper forms part of the SANT initiative (Sentiment Analysis for Norwegian Text), a project seeking to provide open resources and tools for sentiment analysis and opinion mining for Norwegian.",{"paper_id":12536,"title":12537,"year":81,"month":855,"day":63,"doi":12538,"resource_url":12539,"first_page":63,"last_page":63,"pdf_url":12540,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12541,"paper_type":860,"authors":12542,"abstract":12547},"lrec2018-main-662","SenSALDO: Creating a Sentiment Lexicon for Swedish","10.63317\u002F4puostfezf7a","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-662","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F857.pdf","rouces-etal-2018-sensaldo",[12543,12544,12545,12546],{"paper_id":12536,"author_seq":247,"given_name":8532,"surname":8533,"affiliation":63,"orcid":63},{"paper_id":12536,"author_seq":232,"given_name":8535,"surname":8536,"affiliation":63,"orcid":63},{"paper_id":12536,"author_seq":218,"given_name":8538,"surname":8539,"affiliation":63,"orcid":63},{"paper_id":12536,"author_seq":203,"given_name":8541,"surname":8542,"affiliation":63,"orcid":63},"The natural language processing subfield known as sentiment analysis or opinion mining has seen an explosive expansion over the last decade or so, and sentiment analysis has become a standard item in the NLP toolbox.  
Still, many theoretical and methodological questions remain unanswered and resource gaps unfilled. Most work on automated sentiment analysis has been done on English and a few other languages; for most written languages of the world, this tool is not available. This paper describes the development of an extensive sentiment lexicon for written (standard) Swedish. We investigate different methods for developing a sentiment lexicon for Swedish. We use an existing gold standard dataset for training and testing. For each word sense from the SALDO Swedish lexicon, we assign a real value sentiment score in the range [-1,1] and produce a sentiment label. We implement and evaluate three methods: a graph-based method that iterates over the SALDO structure, a method based on random paths over the SALDO structure and a corpus-driven method based on word embeddings. The resulting sense-disambiguated sentiment lexicon (SenSALDO) is an open source resource and freely available from Språkbanken, The Swedish Language Bank at the University of Gothenburg.",{"paper_id":12549,"title":12550,"year":81,"month":855,"day":63,"doi":12551,"resource_url":12552,"first_page":63,"last_page":63,"pdf_url":12553,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12554,"paper_type":860,"authors":12555,"abstract":12569},"lrec2018-main-663","Corpus Building and Evaluation of Aspect-based Opinion Summaries from Tweets in 
Spanish","10.63317\u002F5fyjzpi6vgb9","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-663","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F959.pdf","penaloza-etal-2018-corpus",[12556,12558,12560,12563,12565,12567],{"paper_id":12549,"author_seq":247,"given_name":2554,"surname":12557,"affiliation":63,"orcid":63},"Peñaloza",{"paper_id":12549,"author_seq":232,"given_name":1953,"surname":12559,"affiliation":63,"orcid":63},"López",{"paper_id":12549,"author_seq":218,"given_name":12561,"surname":12562,"affiliation":63,"orcid":63},"Juanjosé","Tenorio",{"paper_id":12549,"author_seq":203,"given_name":11251,"surname":12564,"affiliation":63,"orcid":63},"Gómez",{"paper_id":12549,"author_seq":188,"given_name":12095,"surname":12566,"affiliation":63,"orcid":63},"Oncevay-Marcos",{"paper_id":12549,"author_seq":172,"given_name":4037,"surname":12568,"affiliation":63,"orcid":63},"Sobrevilla Cabezudo","This project involves the presentation and analysis of a corpus of Spanish extractive and abstractive summaries of opinions. The purpose of this work is to display a corpus of diverse summaries that could be used as a reference for academic research as we have not found one for the Spanish language as far as we know.  We have analyzed the summaries based on the agreement between them as this shows how different they are written between each other and on aspect coverage and sentiment orientation as this proves the difference between the content that each summary tries to express. After the experimentation, we have found that even if each annotator uses a different expression to summarize a text, all of them contain similar messages. 
Furthermore, when writing, all of them prioritize on common aspects that are more representative of the corpus.",{"paper_id":12571,"title":12572,"year":81,"month":855,"day":63,"doi":12573,"resource_url":12574,"first_page":63,"last_page":63,"pdf_url":12575,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12576,"paper_type":860,"authors":12577,"abstract":12584},"lrec2018-main-664","Application and Analysis of a Multi-layered Scheme for Irony on the Italian Twitter Corpus TWITTIRÒ","10.63317\u002F582d597s9i68","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-664","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F996.pdf","cignarella-etal-2018-application",[12578,12581,12582,12583],{"paper_id":12571,"author_seq":247,"given_name":12579,"surname":12580,"affiliation":63,"orcid":63},"Alessandra Teresa","Cignarella",{"paper_id":12571,"author_seq":232,"given_name":6011,"surname":6012,"affiliation":63,"orcid":63},{"paper_id":12571,"author_seq":218,"given_name":8846,"surname":8847,"affiliation":63,"orcid":63},{"paper_id":12571,"author_seq":203,"given_name":12254,"surname":9893,"affiliation":63,"orcid":63},"In this paper we describe the main issues emerged within the application of a multi-layered scheme for the fine-grained annotation of irony (Karoui et al., 2017) on an Italian Twitter corpus, i.e. TWITTIRÒ , which is composed of about 1,500 tweets with various provenance. A discussion is proposed about the limits and advantages of the application of the scheme to Italian messages, supported by an analysis of the outcome of the annotation carried on by native Italian speakers in the development of the corpus. 
We present a quantitative and qualitative study both of the distribution of the labels for the different layers involved in the scheme which can shed some light on the process of human annotation for a validation of the annotation scheme on Italian irony-laden social media contents collected in the last years. This results in a novel gold standard for irony detection in Italian, enriched with fine-grained annotations, and in a language resource available to the community and exploitable in the cross- and multi-lingual perspective which characterizes the work that inspired this research.",{"paper_id":12586,"title":12587,"year":81,"month":855,"day":63,"doi":12588,"resource_url":12589,"first_page":63,"last_page":63,"pdf_url":12590,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12591,"paper_type":860,"authors":12592,"abstract":12600},"lrec2018-main-665","Classifier-based Polarity Propagation in a WordNet","10.63317\u002F2yoxmbu8cixn","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-665","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1000.pdf","kocon-etal-2018-classifier",[12593,12595,12598],{"paper_id":12586,"author_seq":247,"given_name":2633,"surname":12594,"affiliation":63,"orcid":63},"Kocoń",{"paper_id":12586,"author_seq":232,"given_name":12596,"surname":12597,"affiliation":63,"orcid":63},"Arkadiusz","Janz",{"paper_id":12586,"author_seq":218,"given_name":2032,"surname":12599,"affiliation":63,"orcid":63},"Piasecki","In this paper we present a novel approach to the construction of an extensive, sense-level sentiment lexicon built on the basis of a wordnet. The main aim of this work is to create a high-quality sentiment lexicon in a partially automated way. We propose a method called Classifier-based Polarity Propagation, which utilises a very rich set of wordnet-based features, to recognize and assign specific sentiment polarity values to wordnet senses. 
We have demonstrated that in comparison to the existing rule-base solutions using specific, narrow set of semantic relations, our method allows for the construction of a more reliable sentiment lexicon, starting with the same seed of annotated synsets.",{"paper_id":12602,"title":12603,"year":81,"month":855,"day":63,"doi":12604,"resource_url":12605,"first_page":63,"last_page":63,"pdf_url":12606,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12607,"paper_type":860,"authors":12608,"abstract":12636},"lrec2018-main-666","SMILE Swiss German Sign Language Dataset","10.63317\u002F3ygyk796mmfb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-666","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F25.pdf","ebling-etal-2018-smile",[12609,12611,12614,12616,12619,12621,12622,12624,12626,12628,12631,12633],{"paper_id":12602,"author_seq":247,"given_name":11312,"surname":12610,"affiliation":63,"orcid":63},"Ebling",{"paper_id":12602,"author_seq":232,"given_name":12612,"surname":12613,"affiliation":63,"orcid":63},"Necati Cihan","Camgöz",{"paper_id":12602,"author_seq":218,"given_name":4582,"surname":12615,"affiliation":63,"orcid":63},"Boyes 
Braem",{"paper_id":12602,"author_seq":203,"given_name":12617,"surname":12618,"affiliation":63,"orcid":63},"Katja","Tissi",{"paper_id":12602,"author_seq":188,"given_name":6270,"surname":12620,"affiliation":63,"orcid":63},"Sidler-Miserez",{"paper_id":12602,"author_seq":172,"given_name":1205,"surname":12304,"affiliation":63,"orcid":63},{"paper_id":12602,"author_seq":155,"given_name":1269,"surname":12623,"affiliation":63,"orcid":63},"Hadfield",{"paper_id":12602,"author_seq":138,"given_name":8141,"surname":12625,"affiliation":63,"orcid":63},"Haug",{"paper_id":12602,"author_seq":121,"given_name":1070,"surname":12627,"affiliation":63,"orcid":63},"Bowden",{"paper_id":12602,"author_seq":104,"given_name":12629,"surname":12630,"affiliation":63,"orcid":63},"Sandrine","Tornay",{"paper_id":12602,"author_seq":87,"given_name":3663,"surname":12632,"affiliation":63,"orcid":63},"Razavi",{"paper_id":12602,"author_seq":73,"given_name":12634,"surname":12635,"affiliation":63,"orcid":63},"Mathew","Magimai-Doss","Sign language recognition (SLR) involves identifying the form and meaning of isolated signs or sequences of signs. To our knowledge, the combination of SLR and sign language assessment is novel. The goal of an ongoing three-year project in Switzerland is to pioneer an assessment system for lexical signs of Swiss German Sign Language (Deutschschweizerische Gebärdensprache, DSGS) that relies on SLR. The assessment system aims to give adult L2 learners of DSGS feedback on the correctness of the manual parameters (handshape, hand position, location, and movement) of isolated signs they produce. In its initial version, the system will include automatic feedback for a subset of a DSGS vocabulary production test consisting of 100 lexical items. 
To provide the SLR component of the assessment system with sufficient training samples, a large-scale dataset containing videotaped repeated productions of the 100 items of the vocabulary test with associated transcriptions and annotations was created, consisting of data from 11 adult L1 signers and 19 adult L2 learners of DSGS. This paper introduces the dataset, which will be made available to the research community.",{"paper_id":12638,"title":12639,"year":81,"month":855,"day":63,"doi":12640,"resource_url":12641,"first_page":63,"last_page":63,"pdf_url":12642,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12643,"paper_type":860,"authors":12644,"abstract":12651},"lrec2018-main-667","IPSL: A Database of Iconicity Patterns in Sign Languages. Creation and Use","10.63317\u002F5jd6s9ekbgvt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-667","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F102.pdf","kimmelman-etal-2018-ipsl",[12645,12647,12649],{"paper_id":12638,"author_seq":247,"given_name":1572,"surname":12646,"affiliation":63,"orcid":63},"Kimmelman",{"paper_id":12638,"author_seq":232,"given_name":884,"surname":12648,"affiliation":63,"orcid":63},"Klezovich",{"paper_id":12638,"author_seq":218,"given_name":5899,"surname":12650,"affiliation":63,"orcid":63},"Moroz","We created the first large-scale database of signs annotated according to various parameters of iconicity. The signs represent concrete concepts in seven semantic fields in nineteen sign languages; 1542 signs in total. Each sign was annotated with respect to the type of form-image association, the presence of iconic location and movement, personification, and with respect to whether the sign depicts a salient part of the concept. We also created a website: https:\u002F\u002Fsl-iconicity.shinyapps.io\u002Ficonicity_patterns\u002F with several visualization tools to represent the data from the database. 
It is possible to visualize iconic properties of separate concepts or iconic properties of semantic fields on the map of the world, and to build graphs representing iconic patterns for selected semantic fields. A preliminary analysis of the data shows that iconicity patterns vary across semantic fields and across languages. The database and the website can be used to further study a variety of theoretical questions related to iconicity in sign languages.",{"paper_id":12653,"title":12654,"year":81,"month":855,"day":63,"doi":12655,"resource_url":12656,"first_page":63,"last_page":63,"pdf_url":12657,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12658,"paper_type":860,"authors":12659,"abstract":12667},"lrec2018-main-668","Sign Languages and the Online World Online Dictionaries & Lexicostatistics","10.63317\u002F3usp6u39to8i","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-668","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F396.pdf","yu-etal-2018-sign",[12660,12661,12664],{"paper_id":12653,"author_seq":247,"given_name":3770,"surname":2406,"affiliation":63,"orcid":63},{"paper_id":12653,"author_seq":232,"given_name":12662,"surname":12663,"affiliation":63,"orcid":63},"Carlo","Geraci",{"paper_id":12653,"author_seq":218,"given_name":12665,"surname":12666,"affiliation":63,"orcid":63},"Natasha","Abner","Several online dictionaries documenting the lexicon of a variety of sign languages (SLs) are now available. These are rich resources for comparative studies, but there are methodological issues that must be addressed regarding how these resources are used for research purposes. We created a web-based tool for annotating the articulatory features of signs (handshape, location, movement and orientation). Videos from online dictionaries may be embedded in the tool, providing a mechanism for large-scale theoretically-informed sign language annotation. 
Annotations are saved in a spreadsheet format ready for quantitative and qualitative analyses. Here, we provide proof of concept for the utility of this tool in linguistic analysis. We used the SL adaptation of the Swadesh list (Woodward, 2000) and applied lexicostatistic and phylogenetic methods to a sample of 23 SLs coded using the web-based tool; supplementary historic information was gathered from the Ethnologue of World Languages and other online sources. We report results from the comparison of all articulatory features for four Asian SLs (Chinese, Hong Kong, Taiwanese and Japanese SLs) and from the comparison of handshapes on the entire 23 language sample. Handshape analysis of the entire sample clusters all Asian SLs together, separated from the European, American, and Brazilian SLs in the sample, as historically expected. Within the Asian SL cluster, analyses also show, for example, marginal relatedness between Chinese and Hong Kong SLs.",{"paper_id":12669,"title":12670,"year":81,"month":855,"day":63,"doi":12671,"resource_url":12672,"first_page":63,"last_page":63,"pdf_url":12673,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12674,"paper_type":860,"authors":12675,"abstract":12681},"lrec2018-main-669","Elicitation protocol and material for a corpus of long prepared monologues in Sign Language","10.63317\u002F2ktzfg3wberx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-669","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F441.pdf","filhol-hadjadj-2018-elicitation",[12676,12678],{"paper_id":12669,"author_seq":247,"given_name":1780,"surname":12677,"affiliation":63,"orcid":63},"Filhol",{"paper_id":12669,"author_seq":232,"given_name":12679,"surname":12680,"affiliation":63,"orcid":63},"Mohamed Nassime","Hadjadj","In this paper, we address collection of prepared Sign Language discourse, as opposed to spontaneous signing. 
Specifically, we aim at collecting long discourse, which creates problems explained in the paper. Being oral and visual languages, they cannot easily be produced while reading notes without distorting the data, and eliciting long discourse without influencing the production order is not trivial. For the moment, corpora contain either short productions, data distortion or disfluencies. We propose a protocol and two tasks with their elicitation material to allow cleaner long-discourse data, and evaluate the result of a recent test with LSF informants.",{"paper_id":12683,"title":12684,"year":81,"month":855,"day":63,"doi":12685,"resource_url":12686,"first_page":63,"last_page":63,"pdf_url":12687,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12688,"paper_type":860,"authors":12689,"abstract":12696},"lrec2018-main-670","Deep JSLC: A Multimodal Corpus Collection for Data-driven Generation of Japanese Sign Language Expressions","10.63317\u002F45x2kxh2c2e7","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-670","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F470.pdf","brock-nakadai-2018-deep",[12690,12693],{"paper_id":12683,"author_seq":247,"given_name":12691,"surname":12692,"affiliation":63,"orcid":63},"Heike","Brock",{"paper_id":12683,"author_seq":232,"given_name":12694,"surname":12695,"affiliation":63,"orcid":63},"Kazuhiro","Nakadai","The three-dimensional visualization of spoken or written information in Sign Language (SL) is considered a potential tool for better inclusion of deaf or hard of hearing individuals with low literacy skills. However, conventional technologies for such CG-supported data display are not able to depict all relevant features of a natural signing sequence such as facial expression, spatial references or inter-sign movement, leading to poor acceptance amongst speakers of sign language. 
The deployment of fully data-driven, deep sequence generation models that proved themselves powerful in speech and text applications might overcome this lack of naturalness. Therefore, we collected a corpus of continuous sentence utterances in Japanese Sign Language (JSL) applicable to the learning of deep neural network models. The presented corpus contains multimodal content information of high resolution motion capture data, video data and both visual and gloss-like mark up annotations obtained with the support of fluent JSL signers. Furthermore, all annotations were encoded under three different encoding schemes with respect to directions, intonation and non-manual information. Currently, the corpus is employed to learn first sequence-to-sequence networks where it shows the ability to train relevant language features.",{"paper_id":12698,"title":12699,"year":81,"month":855,"day":63,"doi":12700,"resource_url":12701,"first_page":63,"last_page":63,"pdf_url":12702,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12703,"paper_type":860,"authors":12704,"abstract":12710},"lrec2018-main-671","Modeling French Sign Language: a proposal for a semantically compositional system","10.63317\u002F3mjavn7bk6fz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-671","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F704.pdf","hadjadj-etal-2018-modeling",[12705,12706,12707],{"paper_id":12698,"author_seq":247,"given_name":12679,"surname":12680,"affiliation":63,"orcid":63},{"paper_id":12698,"author_seq":232,"given_name":1780,"surname":12677,"affiliation":63,"orcid":63},{"paper_id":12698,"author_seq":218,"given_name":12708,"surname":12709,"affiliation":63,"orcid":63},"Annelies","Braffort","The recognition of French Sign Language (LSF) as a natural language in 2005 created an important need for the development of tools to make information accessible to the deaf public. 
With this prospect, the goal of this article is to propose a linguistic approach aimed at modeling the French sign language. We first present the models proposed in computer science to formalize the sign language (SL). We also show the difficulty of applying the grammars originally designed for spoken languages to model SL. In a second step, we propose an approach allowing to take into account the linguistic properties of the SL while respecting the constraints of a modelisation process. By studying the links between semantic functions and their observed forms in Corpus, we have identified several production rules that govern the functioning of the LSF. We finally present the rule functioning as a system capable of modeling an entire utterance in French sign language.",{"paper_id":12712,"title":12713,"year":81,"month":855,"day":63,"doi":12714,"resource_url":12715,"first_page":63,"last_page":63,"pdf_url":12716,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12717,"paper_type":860,"authors":12718,"abstract":12739},"lrec2018-main-672","Construction of the Corpus of Everyday Japanese Conversation: An Interim 
Report","10.63317\u002F33kpbtjoikgt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-672","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F469.pdf","koiso-etal-2018-construction",[12719,12722,12725,12728,12729,12732,12734,12736],{"paper_id":12712,"author_seq":247,"given_name":12720,"surname":12721,"affiliation":63,"orcid":63},"Hanae","Koiso",{"paper_id":12712,"author_seq":232,"given_name":12723,"surname":12724,"affiliation":63,"orcid":63},"Yasuharu","Den",{"paper_id":12712,"author_seq":218,"given_name":12726,"surname":12727,"affiliation":63,"orcid":63},"Yuriko","Iseki",{"paper_id":12712,"author_seq":203,"given_name":4110,"surname":4111,"affiliation":63,"orcid":63},{"paper_id":12712,"author_seq":188,"given_name":12730,"surname":12731,"affiliation":63,"orcid":63},"Yoshiko","Kawabata",{"paper_id":12712,"author_seq":172,"given_name":12733,"surname":8684,"affiliation":63,"orcid":63},"Ken’ya",{"paper_id":12712,"author_seq":155,"given_name":12735,"surname":6146,"affiliation":63,"orcid":63},"Yayoi",{"paper_id":12712,"author_seq":138,"given_name":12737,"surname":12738,"affiliation":63,"orcid":63},"Yasuyuki","Usuda","In 2016, we launched a new corpus project in which we are building a large-scale corpus of everyday Japanese conversation in a balanced manner, aiming at exploring characteristics of conversations in contemporary Japanese through multiple approaches. The corpus targets various kinds of naturally occurring conversations in daily situations, such as conversations during dinner with the family at home, meetings with colleagues at work, and conversations while driving. In this paper, we first introduce an overview of the corpus, including corpus size, conversation variations, recording methods, structure of the corpus, and annotations to be included in the corpus. Next, we report on the current stage of the development of the corpus and legal and ethical issues discussed so far. 
Then we present some results of the preliminary evaluation of the data being collected. We focus on whether or not the 94 hours of conversations collected so far vary in a balanced manner by reference to the survey results of everyday conversational behavior that we conducted previously to build an empirical foundation for the corpus design. We will publish the whole corpus in 2022, consisting of more than 200 hours of recordings.",{"paper_id":12741,"title":12742,"year":81,"month":855,"day":63,"doi":12743,"resource_url":12744,"first_page":63,"last_page":63,"pdf_url":12745,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12746,"paper_type":860,"authors":12747,"abstract":12789},"lrec2018-main-673","Carcinologic Speech Severity Index Project: A Database of Speech Disorder Productions to Assess Quality of Life Related to Speech After Cancer","10.63317\u002F4am6cquuoqum","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-673","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F506.pdf","astesano-etal-2018-carcinologic",[12748,12751,12753,12756,12757,12759,12762,12763,12766,12767,12769,12771,12773,12776,12778,12781,12783,12786],{"paper_id":12741,"author_seq":247,"given_name":12749,"surname":12750,"affiliation":63,"orcid":63},"Corine","Astésano",{"paper_id":12741,"author_seq":232,"given_name":5190,"surname":12752,"affiliation":63,"orcid":63},"Balaguer",{"paper_id":12741,"author_seq":218,"given_name":12754,"surname":12755,"affiliation":63,"orcid":63},"Jérôme","Farinas",{"paper_id":12741,"author_seq":203,"given_name":6634,"surname":6635,"affiliation":63,"orcid":63},{"paper_id":12741,"author_seq":188,"given_name":10751,"surname":12758,"affiliation":63,"orcid":63},"Gaillard",{"paper_id":12741,"author_seq":172,"given_name":12760,"surname":12761,"affiliation":63,"orcid":63},"Alain","Ghio",{"paper_id":12741,"author_seq":155,"given_name":6629,"surname":6630,"affiliation":63,"orcid":63},{"paper_id":12741,"author_seq":138,"
given_name":12764,"surname":12765,"affiliation":63,"orcid":63},"Muriel","Lalain",{"paper_id":12741,"author_seq":121,"given_name":6238,"surname":2188,"affiliation":63,"orcid":63},{"paper_id":12741,"author_seq":104,"given_name":4515,"surname":12768,"affiliation":63,"orcid":63},"Mauclair",{"paper_id":12741,"author_seq":87,"given_name":869,"surname":12770,"affiliation":63,"orcid":63},"Nocaudie",{"paper_id":12741,"author_seq":73,"given_name":2080,"surname":12772,"affiliation":63,"orcid":63},"Pinquier",{"paper_id":12741,"author_seq":55,"given_name":12774,"surname":12775,"affiliation":63,"orcid":63},"Oriol","Pont",{"paper_id":12741,"author_seq":38,"given_name":10340,"surname":12777,"affiliation":63,"orcid":63},"Pouchoulin",{"paper_id":12741,"author_seq":17,"given_name":12779,"surname":12780,"affiliation":63,"orcid":63},"Michèle","Puech",{"paper_id":12741,"author_seq":2594,"given_name":12782,"surname":3376,"affiliation":63,"orcid":63},"Danièle",{"paper_id":12741,"author_seq":2597,"given_name":12784,"surname":12785,"affiliation":63,"orcid":63},"Etienne","Sicard",{"paper_id":12741,"author_seq":2601,"given_name":12787,"surname":12788,"affiliation":63,"orcid":63},"Virginie","Woisard","Within the framework of the Carcinologic Speech Severity Index (C2SI) InCA Project, we collected a large database of French speech recordings aiming at validating Disorder Severity Indexes. Such a database will be useful for measuring the impact of oral and pharyngeal cavity cancer on speech production. That will permit to assess patients' Quality of Life after treatment. The database is composed of audio recordings from 135 speakers and associated metadata. Several intelligibility and comprehensibility levels of speech functions have been evaluated. Acoustics and Prosody have been assessed. Perceptual evaluation rates from both naive and expert juries are being produced. Automatic analyzes are being carried out. 
That will provide to speech therapists objective tools to take into account the intelligibility and comprehensibility of patients which received cancer treatment (surgery and\u002For radiotherapy and\u002For chemotherapy). The aim of this paper is to justify the need of this corpus and his data collection. This corpus will be available to the scientific community through the GIS Parolotheque.",{"paper_id":12791,"title":12792,"year":81,"month":855,"day":63,"doi":12793,"resource_url":12794,"first_page":63,"last_page":63,"pdf_url":12795,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12796,"paper_type":860,"authors":12797,"abstract":12810},"lrec2018-main-674","Parallel Corpora in Mboshi (Bantu C25, Congo-Brazzaville)","10.63317\u002F53ftsjpeyepz","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-674","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F635.pdf","rialland-etal-2018-parallel",[12798,12799,12800,12802,12803,12804,12805,12808,12809],{"paper_id":12791,"author_seq":247,"given_name":1019,"surname":10358,"affiliation":63,"orcid":63},{"paper_id":12791,"author_seq":232,"given_name":5170,"surname":5171,"affiliation":63,"orcid":63},{"paper_id":12791,"author_seq":218,"given_name":12801,"surname":10352,"affiliation":63,"orcid":63},"Guy-Noël",{"paper_id":12791,"author_seq":203,"given_name":10340,"surname":10341,"affiliation":63,"orcid":63},{"paper_id":12791,"author_seq":188,"given_name":866,"surname":867,"affiliation":63,"orcid":63},{"paper_id":12791,"author_seq":172,"given_name":5173,"surname":5174,"affiliation":63,"orcid":63},{"paper_id":12791,"author_seq":155,"given_name":12806,"surname":12807,"affiliation":63,"orcid":63},"Elodie","Gauthier",{"paper_id":12791,"author_seq":138,"given_name":1388,"surname":10338,"affiliation":63,"orcid":63},{"paper_id":12791,"author_seq":121,"given_name":10348,"surname":10349,"affiliation":63,"orcid":63},"This article presents multimodal and parallel data collections in 
Mboshi, as part of the French-German BULB project. It aims at supporting documentation and providing digital resources for less resourced languages with the help of speech and language-based technology. The data collection specifications thus have to meet both field linguists' and computer scientists' requirements, which are large corpora for the latter and linguistically dense data for the former. Beyond speech, the collection comprises pictures and videos documenting social practices, agriculture, wildlife and plants. Visual supports aimed at encouraging people to comment on objects which are meaningful in their daily lives. Speech recordings are composed of the original speech in Mboshi, a respoken version and a translated version to French. These three speech streams remain time-aligned thanks to LIG-AIKUMA, which adds new features to a previous AIKUMA application. The speech corpus includes read material (5k sentences, Bible), verb conjugations and a large part of spontaneous speech (conversations, picture descriptions) resulting in over 50 hours of Mboshi speech, of which 20 hours are already respoken and orally translated to French. These parallel oral data are intended for linguistic documentation (tonology, phonology...) 
and automatic processing (corpus annotation, alignment between Mboshi speech and French translations).",{"paper_id":12812,"title":12813,"year":81,"month":855,"day":63,"doi":12814,"resource_url":12815,"first_page":63,"last_page":63,"pdf_url":12816,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12817,"paper_type":860,"authors":12818,"abstract":12830},"lrec2018-main-675","A Multimodal Corpus of Expert Gaze and Behavior during Phonetic Segmentation Tasks","10.63317\u002F48ccvuqdcrda","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-675","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F676.pdf","khan-etal-2018-multimodal",[12819,12822,12823,12825,12827],{"paper_id":12812,"author_seq":247,"given_name":12820,"surname":12821,"affiliation":63,"orcid":63},"Arif","Khan",{"paper_id":12812,"author_seq":232,"given_name":9806,"surname":9807,"affiliation":63,"orcid":63},{"paper_id":12812,"author_seq":218,"given_name":6148,"surname":12824,"affiliation":63,"orcid":63},"Sugano",{"paper_id":12812,"author_seq":203,"given_name":3238,"surname":12826,"affiliation":63,"orcid":63},"Bulling",{"paper_id":12812,"author_seq":188,"given_name":12828,"surname":12829,"affiliation":63,"orcid":63},"Ross","Macdonald","Phonetic segmentation is the process of splitting speech into distinct phonetic units. Human experts routinely perform this task manually by analyzing auditory and visual cues using analysis software, which is an extremely time-consuming process. Methods exist for automatic segmentation, but these are not always accurate enough. In order to improve automatic segmentation, we need to model it as close to the manual segmentation as possible. This corpus is an effort to capture the human segmentation behavior by recording experts performing a segmentation task. 
We believe that this data will enable us to highlight the important aspects of manual segmentation, which can be used in automatic segmentation to improve its accuracy.",{"paper_id":12832,"title":12833,"year":81,"month":855,"day":63,"doi":12834,"resource_url":12835,"first_page":63,"last_page":63,"pdf_url":12836,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12837,"paper_type":860,"authors":12838,"abstract":12846},"lrec2018-main-676","Statistical Analysis of Missing Translation in Simultaneous Interpretation Using A Large-scale Bilingual Speech Corpus","10.63317\u002F4re39nqnizax","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-676","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F683.pdf","cai-etal-2018-statistical",[12839,12842,12843],{"paper_id":12832,"author_seq":247,"given_name":12840,"surname":12841,"affiliation":63,"orcid":63},"Zhongxi","Cai",{"paper_id":12832,"author_seq":232,"given_name":4441,"surname":10765,"affiliation":63,"orcid":63},{"paper_id":12832,"author_seq":218,"given_name":12844,"surname":12845,"affiliation":63,"orcid":63},"Shigeki","Matsubara","This paper describes statistical analyses of missing translations in simultaneous interpretations. Eighty-eight lectures from English-to-Japanese interpretation data from a large-scale bilingual speech corpus were used for the analyses. Word-level alignment was provided manually, and English words without corresponding Japanese words were considered missing translations. The English lectures contained 46,568 content words, 33.1\\% of which were missing in the translation. We analyzed the relationship between missing translations and various factors, including the speech rate of the source language, delay of interpretation, part-of-speech, and depth in the syntactic structure of the source language. The analyses revealed that the proportion of missing translations is high when the speech rate is high and delay is large. 
We also found that a high proportion of adverbs were missed in the translations, and that words at deeper positions in the syntactic structure were more likely to be missed.",{"paper_id":12848,"title":12849,"year":81,"month":855,"day":63,"doi":12850,"resource_url":12851,"first_page":63,"last_page":63,"pdf_url":12852,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12853,"paper_type":860,"authors":12854,"abstract":12867},"lrec2018-main-677","SynPaFlex-Corpus: An Expressive French Audiobooks Corpus dedicated to expressive speech synthesis.","10.63317\u002F2rdkz74u8h74","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-677","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F723.pdf","sini-etal-2018-synpaflex",[12855,12858,12859,12862,12864],{"paper_id":12848,"author_seq":247,"given_name":12856,"surname":12857,"affiliation":63,"orcid":63},"Aghilas","Sini",{"paper_id":12848,"author_seq":232,"given_name":7058,"surname":7059,"affiliation":63,"orcid":63},{"paper_id":12848,"author_seq":218,"given_name":12860,"surname":12861,"affiliation":63,"orcid":63},"Gaëlle","Vidal",{"paper_id":12848,"author_seq":203,"given_name":4656,"surname":12863,"affiliation":63,"orcid":63},"Tahon",{"paper_id":12848,"author_seq":188,"given_name":12865,"surname":12866,"affiliation":63,"orcid":63},"Élisabeth","Delais-Roussarie","This paper presents an expressive French audiobooks corpus containing eighty seven hours of good audio quality speech, recorded by a single amateur speaker reading audiobooks of different literary genres. This corpus departs from existing corpora collected from audiobooks since they usually provide a few hours of mono-genre and multi-speaker speech. 
The motivation for setting up such a corpus is to explore expressiveness from different perspectives, such as discourse styles, prosody, and pronunciation, and using different levels of analysis (syllable, prosodic and lexical words, prosodic and syntactic phrases, utterance or paragraph). This will allow developing models to better control expressiveness in speech synthesis, and to adapt pronunciation and prosody to specific discourse settings (changes in discourse perspectives, indirect vs. direct styles, etc.). To this end, the corpus has been annotated automatically and provides information as phone labels, phone boundaries, syllables, words or morpho-syntactic tagging. Moreover, a significant part of the corpus has also been annotated manually to encode direct\u002Findirect speech information and emotional content. The corpus is already usable for studies on prosody and TTS purposes and is available to the community.",{"paper_id":12869,"title":12870,"year":81,"month":855,"day":63,"doi":12871,"resource_url":12872,"first_page":63,"last_page":63,"pdf_url":12873,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12874,"paper_type":860,"authors":12875,"abstract":12878},"lrec2018-main-678","Increasing the Accessibility of Time-Aligned Speech Corpora with Spokes Mix","10.63317\u002F3v5w9jzdi9sp","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-678","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F888.pdf","pezik-2018-increasing",[12876],{"paper_id":12869,"author_seq":247,"given_name":996,"surname":12877,"affiliation":63,"orcid":63},"Pęzik","Spokes Mix is an online service providing access to a number of spoken corpora of Polish, including three newly released time-aligned collections of manually transcribed spoken-conversational data. The purpose of this service is two-fold. 
Firstly, it functions as a programmatic interface to a number of unique collections of conversational Polish and potentially also spoken corpora of other languages, exposing their full content with complete metadata and annotations. Equally important, however, is its second function of increasing the general accessibility of these resources for research on spoken and conversational language by providing a centralized, easy-to-use corpus query engine with a responsive web-based user interface.",{"paper_id":12880,"title":12881,"year":81,"month":855,"day":63,"doi":12882,"resource_url":12883,"first_page":63,"last_page":63,"pdf_url":12884,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12885,"paper_type":860,"authors":12886,"abstract":12896},"lrec2018-main-679","The MonPaGe_HA Database for the Documentation of Spoken French Throughout Adulthood","10.63317\u002F3hxh7vacuxgm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-679","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F925.pdf","fougeron-etal-2018-monpage",[12887,12889,12891,12893],{"paper_id":12880,"author_seq":247,"given_name":6575,"surname":12888,"affiliation":63,"orcid":63},"Fougeron",{"paper_id":12880,"author_seq":232,"given_name":6092,"surname":12890,"affiliation":63,"orcid":63},"Delvaux",{"paper_id":12880,"author_seq":218,"given_name":6912,"surname":12892,"affiliation":63,"orcid":63},"Ménard",{"paper_id":12880,"author_seq":203,"given_name":12894,"surname":12895,"affiliation":63,"orcid":63},"Marina","Laganaro","Our knowledge of life-span changes in the speech of adults is quite sparse. Existing reports are mainly based on English speakers and few studies have compared more than two extreme age groups. The present paper describes the recently constituted MonPaGe_HealthyAdults database of spoken French including 405 male and female speakers aged from 20 to 93 years old. 
This database aims at documenting speech throughout adulthood and at building a set of reference values for healthy speakers to be used in clinical assessment of speech. The database is built on five age groups ([20-39], [40-49], [50-59], [60-74], [75+]) and includes 4 regiolects. Speakers have been recorded on a variety of linguistic material and speech tasks in order to cover multiple speech dimensions for each speaker. These cross-sectional data form one of the largest French database available for observing typical changes in the speech of adults as a function of age, and especially in older adults.",{"paper_id":12898,"title":12899,"year":81,"month":855,"day":63,"doi":12900,"resource_url":12901,"first_page":63,"last_page":63,"pdf_url":12902,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12903,"paper_type":860,"authors":12904,"abstract":12910},"lrec2018-main-680","Bringing Order to Chaos: A Non-Sequential Approach for Browsing Large Sets of Found Audio Data","10.63317\u002F532h6xq9yqja","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-680","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1004.pdf","fallgren-etal-2018-bringing",[12905,12906,12907],{"paper_id":12898,"author_seq":247,"given_name":11951,"surname":11952,"affiliation":63,"orcid":63},{"paper_id":12898,"author_seq":232,"given_name":11956,"surname":11957,"affiliation":63,"orcid":63},{"paper_id":12898,"author_seq":218,"given_name":12908,"surname":12909,"affiliation":63,"orcid":63},"Jens","Edlund","We present a novel and general approach for fast and efficient non-sequential browsing of sound in large archives that we know little or nothing about, e.g. so called found data – data not recorded with the specific purpose to be analysed or used as training data. 
Our main motivation is to address some of the problems speech and speech technology researchers see when they try to capitalise on the huge quantities of speech data that reside in public archives. Our method is a combination of audio browsing through massively multi-object sound environments and a well-known unsupervised dimensionality reduction algorithm (SOM). We test the process chain on four data sets of different nature (speech, speech and music, farm animals, and farm animals mixed with farm sounds). The methods are shown to combine well, resulting in rapid and readily interpretable observations. Finally, our initial results are demonstrated in prototype software which is freely available.",{"paper_id":12912,"title":12913,"year":81,"month":855,"day":63,"doi":12914,"resource_url":12915,"first_page":63,"last_page":63,"pdf_url":12916,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12917,"paper_type":860,"authors":12918,"abstract":12928},"lrec2018-main-681","CoLoSS: Cognitive Load Corpus with Speech and Performance Data from a Symbol-Digit Dual-Task","10.63317\u002F4cjmp52uos3b","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-681","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1008.pdf","herms-etal-2018-coloss",[12919,12921,12923,12926],{"paper_id":12912,"author_seq":247,"given_name":3376,"surname":12920,"affiliation":63,"orcid":63},"Herms",{"paper_id":12912,"author_seq":232,"given_name":2301,"surname":12922,"affiliation":63,"orcid":63},"Wirzberger",{"paper_id":12912,"author_seq":218,"given_name":12924,"surname":12925,"affiliation":63,"orcid":63},"Maximilian","Eibl",{"paper_id":12912,"author_seq":203,"given_name":12927,"surname":11838,"affiliation":63,"orcid":63},"Günter Daniel","In this paper, a new corpus named CoLoSS (Cognitive Load by Speech and performance data in a Symbol-digit dual-task) is presented, which contains speech under cognitive load recorded in a learning task scenario. 
In order to obtain a reference for cognitive load, a dual-task approach was applied, including a visual-motor primary task that required subjects to learn abstract symbol combinations and an auditory-verbal secondary task to measure the load imposed by the primary task. We report the methodology of collecting the speech recordings, constructing the corpus and describe the properties of the data. Finally, effects of cognitive load on prosodic as well as voice quality features are investigated in conjunction with the corpus. In its current version, the corpus is available to the scientific community, e.g., for exploring the influence of cognitive load on speech or conducting experiments for speech-based cognitive load recognition.",{"paper_id":12930,"title":12931,"year":81,"month":855,"day":63,"doi":12932,"resource_url":12933,"first_page":63,"last_page":63,"pdf_url":12934,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12935,"paper_type":860,"authors":12936,"abstract":12939},"lrec2018-main-682","VAST: A Corpus of Video Annotation for Speech Technologies","10.63317\u002F5hie2f3i3egr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-682","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1015.pdf","tracey-strassel-2018-vast",[12937,12938],{"paper_id":12930,"author_seq":247,"given_name":4518,"surname":5381,"affiliation":63,"orcid":63},{"paper_id":12930,"author_seq":232,"given_name":1205,"surname":5377,"affiliation":63,"orcid":63},"The Video Annotation for Speech Technologies (VAST) corpus contains approximately 2900 hours of video data collected and labeled to support the development of speech technologies such as speech activity detection, language identification, speaker identification, and speech recognition. The bulk of the data comes from amateur video content harvested from the web. 
Collection was designed to ensure that the videos cover a diverse range of communication domains, data sources and video resolutions and to include three primary languages (English, Mandarin Chinese and Arabic) plus supplemental data in 7 additional languages\u002Fdialects to support language recognition research. Portions of the collected data were annotated for speech activity, speaker identity, speaker sex, language identification, diarization, and transcription. A description of the data collection and each of the annotation types is presented in this paper. The corpus represents a challenging data set for language technology development due to the informal nature of the majority of the data, as well as the variety of languages, noise conditions, topics, and speakers present in the collection.",{"paper_id":12941,"title":12942,"year":81,"month":855,"day":63,"doi":12943,"resource_url":12944,"first_page":63,"last_page":63,"pdf_url":12945,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12946,"paper_type":860,"authors":12947,"abstract":12959},"lrec2018-main-683","Edit me: A Corpus and a Framework for Understanding Natural Language Image 
Editing","10.63317\u002F596xzwsitvsy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-683","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F481.pdf","manuvinakurike-etal-2018-edit",[12948,12951,12952,12953,12954,12955,12956],{"paper_id":12941,"author_seq":247,"given_name":12949,"surname":12950,"affiliation":63,"orcid":63},"Ramesh","Manuvinakurike",{"paper_id":12941,"author_seq":232,"given_name":10376,"surname":10377,"affiliation":63,"orcid":63},{"paper_id":12941,"author_seq":218,"given_name":8763,"surname":8764,"affiliation":63,"orcid":63},{"paper_id":12941,"author_seq":203,"given_name":8558,"surname":2069,"affiliation":63,"orcid":63},{"paper_id":12941,"author_seq":188,"given_name":8766,"surname":1104,"affiliation":63,"orcid":63},{"paper_id":12941,"author_seq":172,"given_name":1208,"surname":1209,"affiliation":63,"orcid":63},{"paper_id":12941,"author_seq":155,"given_name":12957,"surname":12958,"affiliation":63,"orcid":63},"Kallirroi","Georgila","This paper introduces the task of interacting with an image editing program through natural language. We present a corpus of image edit requests which were elicited for real world images, and an annotation framework for understanding such natural language instructions and mapping them to actionable computer commands. 
Finally, we evaluate crowd-sourced annotation as a means of efficiently creating a sizable corpus at a reasonable cost.",{"paper_id":12961,"title":12962,"year":81,"month":855,"day":63,"doi":12963,"resource_url":12964,"first_page":63,"last_page":63,"pdf_url":12965,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12966,"paper_type":860,"authors":12967,"abstract":12974},"lrec2018-main-684","Enriching a Lexicon of Discourse Connectives with Corpus-based Data","10.63317\u002F4swxgpd4mt4w","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-684","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F528.pdf","feltracco-etal-2018-enriching",[12968,12970,12973],{"paper_id":12961,"author_seq":247,"given_name":884,"surname":12969,"affiliation":63,"orcid":63},"Feltracco",{"paper_id":12961,"author_seq":232,"given_name":12971,"surname":12972,"affiliation":63,"orcid":63},"Elisabetta","Jezek",{"paper_id":12961,"author_seq":218,"given_name":2542,"surname":2543,"affiliation":63,"orcid":63},"We present the results of the effort of enriching the pre-existing resource LICO, a Lexicon of Italian COnnectives retrieved from lexicographic sources (Feltracco et al., 2016), with real corpus data for connectives marking contrast relations in text. The motivation beyond our effort is that connectives can only be interpreted when they appear in context, that is, in a relation between the two fragments of text that constitute the two arguments of the relation. In this perspective, adding corpus examples annotated with connectives and arguments for the relation allows us to both extend the resource and validate the lexicon. 
In order to retrieve good corpus examples, we take advantage of the existing Contrast-Ita Bank (Feltracco et al., 2017), a corpus of news annotated with explicit and implicit discourse contrast relations for Italian according to the annotation scheme proposed in the Penn Discourse Tree Bank (PDTB) guidelines (Prasad et al., 2007). We also use an extended -non contrast annotated- version of the same corpus and documents from Wikipedia. The resulting resource represents a valuable tool for both linguistic analyses of discourse relations and the training of a classifier for NLP applications.",{"paper_id":12976,"title":12977,"year":81,"month":855,"day":63,"doi":12978,"resource_url":12979,"first_page":63,"last_page":63,"pdf_url":12980,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12981,"paper_type":860,"authors":12982,"abstract":12986},"lrec2018-main-685","SimPA: A Sentence-Level Simplification Corpus for the Public Administration Domain","10.63317\u002F53kcf9yt95sy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-685","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F542.pdf","scarton-etal-2018-simpa",[12983,12984,12985],{"paper_id":12976,"author_seq":247,"given_name":10712,"surname":10713,"affiliation":63,"orcid":63},{"paper_id":12976,"author_seq":232,"given_name":10715,"surname":10716,"affiliation":63,"orcid":63},{"paper_id":12976,"author_seq":218,"given_name":10718,"surname":10719,"affiliation":63,"orcid":63},"We present a sentence-level simplification corpus with content from the Public Administration (PA) domain. The corpus contains 1,100 original sentences with manual simplifications collected through a two-stage process. Firstly, annotators were asked to simplify only words and phrases (lexical simplification). Each sentence was simplified by three annotators. Secondly, one lexically simplified version of each original sentence was further simplified at the syntactic level. 
In its current version there are 3,300 lexically simplified sentences plus 1,100 syntactically simplified sentences. The corpus will be used for evaluation of text simplification approaches in the scope of the EU H2020 SIMPATICO project - which focuses on accessibility of e-services in the PA domain - and beyond. The main advantage of this corpus is that lexical and syntactic simplifications can be analysed and used in isolation. The lexically simplified corpus is also multi-reference (three different simplifications per original sentence). This is an ongoing effort and our final aim is to collect manual simplifications for the entire set of original sentences, with over 10K sentences.",{"paper_id":12988,"title":12989,"year":81,"month":855,"day":63,"doi":12990,"resource_url":12991,"first_page":63,"last_page":63,"pdf_url":12992,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":12993,"paper_type":860,"authors":12994,"abstract":13004},"lrec2018-main-686","The brWaC Corpus: A New Open Resource for Brazilian Portuguese","10.63317\u002F3kpvgaisx8v2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-686","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F599.pdf","wagner-filho-etal-2018-brwac",[12995,12998,12999,13001],{"paper_id":12988,"author_seq":247,"given_name":12996,"surname":12997,"affiliation":63,"orcid":63},"Jorge A.","Wagner Filho",{"paper_id":12988,"author_seq":232,"given_name":1953,"surname":1954,"affiliation":63,"orcid":63},{"paper_id":12988,"author_seq":218,"given_name":922,"surname":13000,"affiliation":63,"orcid":63},"Idiart",{"paper_id":12988,"author_seq":203,"given_name":13002,"surname":13003,"affiliation":63,"orcid":63},"Aline","Villavicencio","In this work, we present the construction process of a large Web corpus for Brazilian Portuguese, aiming to achieve a size comparable to the state of the art in other languages. 
We also discuss our updated sentence-level approach for the strict removal of duplicated content. Following the pipeline methodology, more than 60 million pages were crawled and filtered, with  3.5 million being selected. The obtained multi-domain corpus, named brWaC, is composed by 2.7 billion tokens, and has been annotated with tagging and parsing information. The incidence of non-unique long sentences, an indication of replicated content, which reaches 9% in other Web corpora, was reduced to only 0.5%. Domain diversity was also maximized, with 120,000 different websites contributing content. We are making our new resource freely available for the research community, both for querying and downloading, in the expectation of aiding in new advances for the processing of Brazilian Portuguese.",{"paper_id":13006,"title":13007,"year":81,"month":855,"day":63,"doi":13008,"resource_url":13009,"first_page":63,"last_page":63,"pdf_url":13010,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13011,"paper_type":860,"authors":13012,"abstract":13018},"lrec2018-main-687","Czech Text Document Corpus v 2.0","10.63317\u002F3ze4fotp6x6s","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-687","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F671.pdf","kral-lenc-2018-czech",[13013,13015],{"paper_id":13006,"author_seq":247,"given_name":4670,"surname":13014,"affiliation":63,"orcid":63},"Král",{"paper_id":13006,"author_seq":232,"given_name":13016,"surname":13017,"affiliation":63,"orcid":63},"Ladislav","Lenc","This paper introduces \"Czech Text Document Corpus v 2.0\", a collection of text documents for automatic document classification in Czech language. It is composed of the text documents provided by the Czech News Agency and is freely available for research purposes at http:\u002F\u002Fctdc.kiv.zcu.cz\u002F. 
This corpus was created in order to facilitate a straightforward comparison of the document classification approaches on Czech data. It is particularly dedicated to evaluation of multi-label document classification approaches, because one document is usually labelled with more than one label. Besides the information about the document classes, the corpus is also annotated at the morphological layer. This paper further shows the results of selected state-of-the-art methods on this corpus to offer the possibility of an easy comparison with these approaches.",{"paper_id":13020,"title":13021,"year":81,"month":855,"day":63,"doi":13022,"resource_url":13023,"first_page":63,"last_page":63,"pdf_url":13024,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13025,"paper_type":860,"authors":13026,"abstract":13034},"lrec2018-main-688","Corpora of Typical Sentences","10.63317\u002F4wdznxjqqfz2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-688","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F724.pdf","muller-etal-2018-corpora",[13027,13029,13032],{"paper_id":13020,"author_seq":247,"given_name":13028,"surname":6936,"affiliation":63,"orcid":63},"Lydia",{"paper_id":13020,"author_seq":232,"given_name":13030,"surname":13031,"affiliation":63,"orcid":63},"Uwe","Quasthoff",{"paper_id":13020,"author_seq":218,"given_name":2032,"surname":13033,"affiliation":63,"orcid":63},"Sumalvico","Typical sentences of characteristic syntactic structures can be used for language understanding tasks like finding typical slotfiller for verbs. The paper describes the selection of such typical sentences representing usually about 5% of the original corpus. The sentences are selected by the frequency of the corresponding POS tag sequence together with an entropy theshold, and the selection method is shown to work language independently. 
Entropy measuring the distribution of words in a given position turns out to identify larger sets of near-duplicate sentences, not considered typical. A statistical comparison of those subcorpora with the underlying corpus shows the intended shorter sentence length, but also a decrease of word frequencies for function words associated to more complex sentences.",{"paper_id":13036,"title":13037,"year":81,"month":855,"day":63,"doi":13038,"resource_url":13039,"first_page":63,"last_page":63,"pdf_url":13040,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13041,"paper_type":860,"authors":13042,"abstract":13050},"lrec2018-main-689","The German Reference Corpus DeReKo: New Developments – New Opportunities","10.63317\u002F2v7vv6rgq6fh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-689","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F737.pdf","kupietz-etal-2018-german",[13043,13045,13048,13049],{"paper_id":13036,"author_seq":247,"given_name":4664,"surname":13044,"affiliation":63,"orcid":63},"Kupietz",{"paper_id":13036,"author_seq":232,"given_name":13046,"surname":13047,"affiliation":63,"orcid":63},"Harald","Lüngen",{"paper_id":13036,"author_seq":218,"given_name":2029,"surname":1315,"affiliation":63,"orcid":63},{"paper_id":13036,"author_seq":203,"given_name":3238,"surname":7574,"affiliation":63,"orcid":63},"This paper discusses current trends in DeReKo, the German Reference Corpus, concerning legal issues around the recent German copyright reform with positive implications for corpus building and corpus linguistics in general, recent corpus extensions in the genres of popular magazines, journals, historical texts, and web-based football reports. 
Besides, DeReKo is finally accessible via the new corpus research platform KorAP, offering registered users several news features in comparison with its predecessor COSMAS II.",{"paper_id":13052,"title":13053,"year":81,"month":855,"day":63,"doi":13054,"resource_url":13055,"first_page":63,"last_page":63,"pdf_url":13056,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13057,"paper_type":860,"authors":13058,"abstract":13071},"lrec2018-main-690","Risamálheild: A Very Large Icelandic Text Corpus","10.63317\u002F2ikwpd53aott","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-690","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F746.pdf","steingrimsson-etal-2018-risamalheild",[13059,13062,13064,13067,13070],{"paper_id":13052,"author_seq":247,"given_name":13060,"surname":13061,"affiliation":63,"orcid":63},"Steinþór","Steingrímsson",{"paper_id":13052,"author_seq":232,"given_name":13063,"surname":9715,"affiliation":63,"orcid":63},"Sigrún",{"paper_id":13052,"author_seq":218,"given_name":13065,"surname":13066,"affiliation":63,"orcid":63},"Eiríkur","Rögnvaldsson",{"paper_id":13052,"author_seq":203,"given_name":13068,"surname":13069,"affiliation":63,"orcid":63},"Starkaður","Barkarson",{"paper_id":13052,"author_seq":188,"given_name":9720,"surname":9721,"affiliation":63,"orcid":63},"We present Risamálheild, the Icelandic Gigaword Corpus (IGC), a corpus containing more than one billion running words from mostly contemporary texts. The work was carried out with minimal amount of work and resources, focusing on material that is not protected by copyright and sources which could provide us with large chunks of text for each cleared permission. The two main sources considered were therefore official texts and texts from news media. Only digitally available texts are included in the corpus and formats that can be problematic are not processed. The corpus texts are morphosyntactically tagged and provided with metadata. 
Processes have been set up for continuous text collection, cleaning and annotation. The corpus is available for search and download with permissive licenses. The dataset is intended to be clearly versioned with the first version released in early 2018. Texts will be collected continually and a new version published every year.",{"paper_id":13073,"title":13074,"year":81,"month":855,"day":63,"doi":13075,"resource_url":13076,"first_page":63,"last_page":63,"pdf_url":13077,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13078,"paper_type":860,"authors":13079,"abstract":13088},"lrec2018-main-691","TriMED: A Multilingual Terminological Database","10.63317\u002F385sezybq6wb","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-691","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F715.pdf","vezzani-etal-2018-trimed",[13080,13082,13085],{"paper_id":13073,"author_seq":247,"given_name":12257,"surname":13081,"affiliation":63,"orcid":63},"Vezzani",{"paper_id":13073,"author_seq":232,"given_name":13083,"surname":13084,"affiliation":63,"orcid":63},"Giorgio Maria","Di Nunzio",{"paper_id":13073,"author_seq":218,"given_name":13086,"surname":13087,"affiliation":63,"orcid":63},"Geneviève","Henrot","Three precise categories of people are confronted with the complexity of medical language: physicians, patients and scientific translators. The purpose of this work is to develop a methodology for the implementation of a terminological tool that contributes to solve problems related to the opacity that characterizes communication in the medical field among its various actors. The main goals are: i) satisfy the peer-to-peer communication, ii) facilitate the comprehension of medical information by patients, and iii) provide a regularly updated resource for scientific translators.  We illustrate our methodology and its application through the description of a multilingual terminological-phraseological resource named TriMED. 
This terminological database will consist of records designed to create a terminological bridge between the various registers (specialist, semi-specialist, non-specialist) as well as across the languages considered. In this initial analysis, we restricted to the field of breast cancer, and the terms to be analyzed will be extracted from a corpus in English, accompanied by all relevant linguistic information and properties, and re-attached to their pragmatic equivalent in Italian and French.",{"paper_id":13090,"title":13091,"year":81,"month":855,"day":63,"doi":13092,"resource_url":13093,"first_page":63,"last_page":63,"pdf_url":13094,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13095,"paper_type":860,"authors":13096,"abstract":13105},"lrec2018-main-692","Preparation and Usage of Xhosa Lexicographical Data for a Multilingual, Federated Environment","10.63317\u002F2en3enuhx9w5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-692","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F803.pdf","bosch-etal-2018-preparation",[13097,13100,13101,13102,13104],{"paper_id":13090,"author_seq":247,"given_name":13098,"surname":13099,"affiliation":63,"orcid":63},"Sonja","Bosch",{"paper_id":13090,"author_seq":232,"given_name":1615,"surname":7190,"affiliation":63,"orcid":63},{"paper_id":13090,"author_seq":218,"given_name":7844,"surname":7845,"affiliation":63,"orcid":63},{"paper_id":13090,"author_seq":203,"given_name":1924,"surname":13103,"affiliation":63,"orcid":63},"Goldhahn",{"paper_id":13090,"author_seq":188,"given_name":13030,"surname":13031,"affiliation":63,"orcid":63},"The South African linguistic landscape is characterised by multilingualism and the influence between their eleven official and some local languages. 
Unfortunately, for most of the languages the amount and quality of available lexicographical data is suboptimal, even though its availability is essential for all educational institutions and for the development of state-of-the-art language technology. In this paper we present a new source of lexicographical data for Xhosa, a language spoken by more than eight million speakers. For its utilisation in a multilingual and federated environment it is modelled using a dedicated OWL ontology for Bantu languages and possesses all features that are currently considered integral for the promotion of resource reuse as well as long-term usage. In the future, the introduced ontology may be used for other Bantu languages as well and may ease their combination to achieve more extensive, multilingual data stocks.",{"paper_id":13107,"title":13108,"year":81,"month":855,"day":63,"doi":13109,"resource_url":13110,"first_page":63,"last_page":63,"pdf_url":13111,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13112,"paper_type":860,"authors":13113,"abstract":13120},"lrec2018-main-693","A Lexicon of Discourse Markers for Portuguese – LDM-PT","10.63317\u002F3pcxdc5fv3f5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-693","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F814.pdf","mendes-etal-2018-lexicon",[13114,13115,13117,13118],{"paper_id":13107,"author_seq":247,"given_name":3893,"surname":3894,"affiliation":63,"orcid":63},{"paper_id":13107,"author_seq":232,"given_name":12344,"surname":13116,"affiliation":63,"orcid":63},"del Rio",{"paper_id":13107,"author_seq":218,"given_name":1058,"surname":5633,"affiliation":63,"orcid":63},{"paper_id":13107,"author_seq":203,"given_name":1211,"surname":13119,"affiliation":63,"orcid":63},"Dombek","We present LDM-PT, a lexicon of discourse markers for European Portuguese, composed of 252 pairs of discourse marker\u002Frhetorical sense. 
The lexicon covers conjunctions, prepositions, adverbs, adverbial phrases and alternative lexicalizations with a connective function, as in the PDTB (Prasad et al., 2008; Prasad et al., 2010). For each discourse marker in the lexicon, there is information regarding its type, category, mood and tense restrictions over the sentence it introduces, rhetorical sense, following the PDTB 3.0 sense hierarchy (Webber et al., 2016), as well as a link to an English near-synonym and a corpus example. The lexicon is compiled in a single excel spread sheet that is later converted to an XML scheme compatible with the DiMLex format (Stede, 2002). We give a detailed description of the contents and format of the lexicon, and discuss possible applications of this resource for discourse studies and discourse processing tools for Portuguese.",{"paper_id":13122,"title":13123,"year":81,"month":855,"day":63,"doi":13124,"resource_url":13125,"first_page":63,"last_page":63,"pdf_url":13126,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13127,"paper_type":860,"authors":13128,"abstract":13135},"lrec2018-main-694","One Language to rule them all: modelling Morphological Patterns in a Large Scale Italian Lexicon with SWRL","10.63317\u002F32tzxgk89a8o","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-694","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F844.pdf","khan-etal-2018-one",[13129,13130,13132,13134],{"paper_id":13122,"author_seq":247,"given_name":4090,"surname":12821,"affiliation":63,"orcid":63},{"paper_id":13122,"author_seq":232,"given_name":2516,"surname":13131,"affiliation":63,"orcid":63},"Bellandi",{"paper_id":13122,"author_seq":218,"given_name":7739,"surname":13133,"affiliation":63,"orcid":63},"Frontini",{"paper_id":13122,"author_seq":203,"given_name":2653,"surname":2654,"affiliation":63,"orcid":63},"We present an application of Semantic Web Technologies to computational lexicography. 
More precisely we describe the publication of the \\textit{morphological layer} of the Italian Parole Simple Clips lexicon (PSC-M) as linked open data. The novelty of our work is in the use of the Semantic Web Rule Language (SWRL) to encode morphological patterns, thereby allowing the automatic derivation of the inflectional variants of the entries in the lexicon. By doing so we make these patterns available in a form that is human readable and that therefore gives a comprehensive morphological description of a large number of Italian words.",{"paper_id":13137,"title":13138,"year":81,"month":855,"day":63,"doi":13139,"resource_url":13140,"first_page":63,"last_page":63,"pdf_url":13141,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13142,"paper_type":860,"authors":13143,"abstract":13145},"lrec2018-main-695","Metaphor Suggestions based on a Semantic Metaphor Repository","10.63317\u002F56koyge6q27j","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-695","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F967.pdf","de-melo-2018-metaphor",[13144],{"paper_id":13137,"author_seq":247,"given_name":1037,"surname":1038,"affiliation":63,"orcid":63},"Metaphors are not only remarkably pervasive in both informal and formal text, but reflect fundamental properties of human cognition. This paper presents an algorithmic model that suggests metaphoric means of referring to concepts. For example, given a word such as \"government\", the method may propose expressions such as \"father\", \"nanny\", corresponding to common ways of thinking about the government. To achieve this, the model draws on MetaNet, a manually created repository of conceptual metaphor, in conjunction with lexical resources like FrameNet and WordNet and automated interlinking techniques. 
These resources are connected and their overall graph structure allows us to propose potential metaphoric means of referring to the given input words, possibly constrained with additional metaphoric seed words, which may be provided as supplementary inputs. The experiments show that this algorithm greatly expands the potential of the repository.",{"paper_id":13147,"title":13148,"year":81,"month":855,"day":63,"doi":13149,"resource_url":13150,"first_page":63,"last_page":63,"pdf_url":13151,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13152,"paper_type":860,"authors":13153,"abstract":13160},"lrec2018-main-696","The Linguistic Category Model in Polish (LCM-PL)","10.63317\u002F35kb7ardwyek","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-696","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1019.pdf","wawer-sarzynska-2018-linguistic",[13154,13157],{"paper_id":13147,"author_seq":247,"given_name":13155,"surname":13156,"affiliation":63,"orcid":63},"Aleksander","Wawer",{"paper_id":13147,"author_seq":232,"given_name":13158,"surname":13159,"affiliation":63,"orcid":63},"Justyna","Sarzyńska","This article describes the first public release of Linguistic Category Model (LCM) dictionary for the Polish language (LCM-PL). It is used for verb categorization in terms of their abstractness and applied in many research scenarios, mostly in psychology. The dictionary consists of three distinctive parts: (1) sense-level manual annotation, (2) lexeme-level manual annotation, (3) lexeme-level automated annotation. The part (1) is of high quality yet the most expensive to obtain, therefore we complement it with options (2) and (3) to generate LCM labels for all verbs in Polish. Our dictionary is freely available for use and integrated with Słowosieć 3.0 (the Polish WordNet). 
Its quality will improve: we'll add more manually annotated senses and increase the quality of automated annotations.",{"paper_id":13162,"title":13163,"year":81,"month":855,"day":63,"doi":13164,"resource_url":13165,"first_page":63,"last_page":63,"pdf_url":13166,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13167,"paper_type":860,"authors":13168,"abstract":13173},"lrec2018-main-697","WordNet-Shp: Towards the Building of a Lexical Database for a Peruvian Minority Language","10.63317\u002F2q4mwekfxxi6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-697","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1044.pdf","maguino-valencia-etal-2018-wordnet",[13169,13171,13172],{"paper_id":13162,"author_seq":247,"given_name":7945,"surname":13170,"affiliation":63,"orcid":63},"Maguiño-Valencia",{"paper_id":13162,"author_seq":232,"given_name":12095,"surname":12566,"affiliation":63,"orcid":63},{"paper_id":13162,"author_seq":218,"given_name":4037,"surname":12568,"affiliation":63,"orcid":63},"WordNet-like resources are lexical databases with highly relevance information and data which could be exploited in more complex computational linguistics research and applications. The building process requires manual and automatic tasks, that could be more arduous if the language is a minority one with fewer digital resources. This study focuses in the construction of an initial WordNet database for a low-resourced and indigenous language in Peru: Shipibo-Konibo (shp). First, the stages of development from a scarce scenario (a bilingual dictionary shp-es) are described. Then, it is proposed a synset alignment method by comparing the definition glosses in the dictionary (written in Spanish) with the content of a Spanish WordNet. In this sense, word2vec similarity was the chosen metric for the proximity measure. 
Finally, an evaluation process is performed for the synsets, using a manually annotated Gold Standard in Shipibo-Konibo. The obtained results are promising, and this resource is expected to serve well in further applications, such as word sense disambiguation and even machine translation in the shp-es language pair.",{"paper_id":13175,"title":13176,"year":81,"month":855,"day":63,"doi":13177,"resource_url":13178,"first_page":63,"last_page":63,"pdf_url":13179,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13180,"paper_type":860,"authors":13181,"abstract":13189},"lrec2018-main-698","Retrieving Information from the French Lexical Network in RDF\u002FOWL Format","10.63317\u002F57xwzvstuxxf","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-698","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1102.pdf","fonseca-etal-2018-retrieving",[13182,13185,13188],{"paper_id":13175,"author_seq":247,"given_name":13183,"surname":13184,"affiliation":63,"orcid":63},"Alexsandro","Fonseca",{"paper_id":13175,"author_seq":232,"given_name":13186,"surname":13187,"affiliation":63,"orcid":63},"Fatiha","Sadat",{"paper_id":13175,"author_seq":218,"given_name":3525,"surname":9424,"affiliation":63,"orcid":63},"In this paper, we present a Java API to retrieve the lexical information from the French Lexical Network, a lexical resource based on the Meaning-Text Theory’s lexical functions, which was previously transformed to an RDF\u002FOWL format. We present four API functions: one that returns all the lexical relations between two given vocables; one that returns all the lexical relations and the lexical functions modeling those relations for two given vocables; one that returns all the lexical relations encoded in the lexical network modeled by a specific lexical function; and one that returns the semantic perspectives for a specific lexical function. 
This API was used in the identification of collocations in a French corpus of 1.8 million sentences and in the semantic classification of these collocations.",{"paper_id":13191,"title":13192,"year":81,"month":855,"day":63,"doi":13193,"resource_url":13194,"first_page":63,"last_page":63,"pdf_url":13195,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13196,"paper_type":860,"authors":13197,"abstract":13202},"lrec2018-main-699","Transforming Wikipedia into a Large-Scale Fine-Grained Entity Type Corpus","10.63317\u002F2zjrem3tbunr","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-699","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F11.pdf","ghaddar-langlais-2018-transforming",[13198,13201],{"paper_id":13191,"author_seq":247,"given_name":13199,"surname":13200,"affiliation":63,"orcid":63},"Abbas","Ghaddar",{"paper_id":13191,"author_seq":232,"given_name":2089,"surname":6792,"affiliation":63,"orcid":63},"This paper presents WiFiNE, an English corpus annotated with fine-grained entity types. We propose simple but effective heuristics we applied to English Wikipedia to build a large, high quality, annotated corpus. We evaluate the impact of our corpus on the fine-grained entity typing system of Shimaoka et al. (2017), with 2 manually annotated benchmarks, FIGER (GOLD) and ONTONOTES . We report state-of-the-art performances, with a gain of 0.8 micro F1 score on the former dataset and a gain of 2.7 macro F1 score on the latter one, despite the fact that we employ the same quantity of training data used in previous works. 
We make our corpus available as a resource for future works.",{"paper_id":13204,"title":13205,"year":81,"month":855,"day":63,"doi":13206,"resource_url":13207,"first_page":63,"last_page":63,"pdf_url":13208,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13209,"paper_type":860,"authors":13210,"abstract":13228},"lrec2018-main-700","Error Analysis of Uyghur Name Tagging: Language-specific Techniques and Remaining Challenges","10.63317\u002F4oqn93a5zuqv","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-700","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F12.pdf","abudukelimu-etal-2018-error",[13211,13214,13217,13219,13222,13225,13227],{"paper_id":13204,"author_seq":247,"given_name":13212,"surname":13213,"affiliation":63,"orcid":63},"Halidanmu","Abudukelimu",{"paper_id":13204,"author_seq":232,"given_name":13215,"surname":13216,"affiliation":63,"orcid":63},"Abudoukelimu","Abulizi",{"paper_id":13204,"author_seq":218,"given_name":13218,"surname":2381,"affiliation":63,"orcid":63},"Boliang",{"paper_id":13204,"author_seq":203,"given_name":13220,"surname":13221,"affiliation":63,"orcid":63},"Xiaoman","Pan",{"paper_id":13204,"author_seq":188,"given_name":13223,"surname":13224,"affiliation":63,"orcid":63},"Di","Lu",{"paper_id":13204,"author_seq":172,"given_name":13226,"surname":5953,"affiliation":63,"orcid":63},"Heng",{"paper_id":13204,"author_seq":155,"given_name":3503,"surname":5232,"affiliation":63,"orcid":63},"Regardless of numerous efforts at name tagging for Uyghur, there is limited understanding on the performance ceiling. In this paper, we take a close look at the successful cases and perform careful analysis on the remaining errors of a state-of-the-art Uyghur name tagger, systematically categorize challenges, and propose possible solutions. 
We conclude that simply adopting a machine learning model which is proven successful for high-resource languages along with language-independent superficial features is unlikely to be effective for Uyghur, or low-resource languages in general. Further advancement requires exploiting rich language-specific knowledge and non-traditional linguistic resources, and novel methods to encode them into machine learning frameworks.",{"paper_id":13230,"title":13231,"year":81,"month":855,"day":63,"doi":13232,"resource_url":13233,"first_page":63,"last_page":63,"pdf_url":13234,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13235,"paper_type":860,"authors":13236,"abstract":13242},"lrec2018-main-701","BiLSTM-CRF for Persian Named-Entity Recognition ArmanPersoNERCorpus: the First Entity-Annotated Persian Dataset","10.63317\u002F2oemthbv7w4w","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-701","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F48.pdf","poostchi-etal-2018-bilstm",[13237,13240,13241],{"paper_id":13230,"author_seq":247,"given_name":13238,"surname":13239,"affiliation":63,"orcid":63},"Hanieh","Poostchi",{"paper_id":13230,"author_seq":232,"given_name":3542,"surname":3543,"affiliation":63,"orcid":63},{"paper_id":13230,"author_seq":218,"given_name":3545,"surname":3546,"affiliation":63,"orcid":63},"Named-entity recognition (NER) can still be regarded as work in progress for a number of Asian languages due to the scarcity of annotated corpora. For this reason, with this paper we publicly release an entity-annotated Persian dataset and we present a performing approach for Persian NER based on a deep learning architecture. In addition to the entity-annotated dataset, we release a number of word embeddings (including GloVe, skip-gram, CBOW and Hellinger PCA) trained on a sizable collation of Persian text. 
The combination of the deep learning architecture (a BiLSTM-CRF) and the pre-trained word embeddings has allowed us to achieve a 77.45% CoNLL F1 score, a result that is more than 12 percentage points higher than the best previous result and interesting in absolute terms.",{"paper_id":13244,"title":13245,"year":81,"month":855,"day":63,"doi":13246,"resource_url":13247,"first_page":63,"last_page":63,"pdf_url":13248,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13249,"paper_type":860,"authors":13250,"abstract":13256},"lrec2018-main-702","Data Anonymization for Requirements Quality Analysis: a Reproducible Automatic Error Detection Task","10.63317\u002F2ocvqiebj95w","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-702","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F72.pdf","kang-park-2018-data",[13251,13254],{"paper_id":13244,"author_seq":247,"given_name":13252,"surname":13253,"affiliation":63,"orcid":63},"Juyeon","Kang",{"paper_id":13244,"author_seq":232,"given_name":13255,"surname":5615,"affiliation":63,"orcid":63},"Jungyeul","In this work, we aim at identifying potential problems of ambiguity, completeness, conformity, singularity and readability in system and software requirements specifications.  Those problems arise particularly when they are written in a natural language.  While we describe them from a linguistic point of view, the business impacts of each potential error are also considered in system engineering context.  We investigate and explore error patterns for requirements quality analysis by manually analyzing the corpus. This analysis is based on the requirements grammar that we developed in our previous work.  
In addition, this paper extends our previous work in a two-fold way: (1) we increase more than twice the number of evaluation data (1K sentences) through a manual verification process, and (2) we anonymize all sensible and confidential entities in evaluation data to make our data publicly available. We also provide the baseline system using conditional random fields for requirements quality analysis, and we obtain 79.47\\% for the F$_1$ score on proposed evaluation data.",{"paper_id":13258,"title":13259,"year":81,"month":855,"day":63,"doi":13260,"resource_url":13261,"first_page":63,"last_page":63,"pdf_url":13262,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13263,"paper_type":860,"authors":13264,"abstract":13279},"lrec2018-main-703","A German Corpus for Fine-Grained Named Entity Recognition and Relation Extraction of Traffic and Industry Events","10.63317\u002F4doxr7gqxhfy","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-703","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F85.pdf","schiersch-etal-2018-german",[13265,13267,13270,13272,13273,13276],{"paper_id":13258,"author_seq":247,"given_name":1248,"surname":13266,"affiliation":63,"orcid":63},"Schiersch",{"paper_id":13258,"author_seq":232,"given_name":13268,"surname":13269,"affiliation":63,"orcid":63},"Veselina","Mironova",{"paper_id":13258,"author_seq":218,"given_name":12924,"surname":13271,"affiliation":63,"orcid":63},"Schmitt",{"paper_id":13258,"author_seq":203,"given_name":2089,"surname":1615,"affiliation":63,"orcid":63},{"paper_id":13258,"author_seq":188,"given_name":13274,"surname":13275,"affiliation":63,"orcid":63},"Aleksandra","Gabryszak",{"paper_id":13258,"author_seq":172,"given_name":13277,"surname":13278,"affiliation":63,"orcid":63},"Leonhard","Hennig","Monitoring mobility- and industry-relevant events is important in areas such as personal travel planning and supply chain management, but extracting events pertaining to specific 
companies, transit routes and locations from heterogeneous, high-volume text streams remains a significant challenge. This work describes a corpus of German-language documents which has been annotated with fine-grained geo-entities, such as streets, stops and routes, as well as standard named entity types. It has also been annotated with a set of 15 traffic- and industry-related n-ary relations and events, such as accidents, traffic jams, acquisitions, and strikes. The corpus consists of newswire texts, Twitter messages, and traffic reports from radio stations, police and railway companies. It allows for training and evaluating both named entity recognition algorithms that aim for fine-grained typing of geo-entities, as well as n-ary relation extraction systems.",{"paper_id":13281,"title":13282,"year":81,"month":855,"day":63,"doi":13283,"resource_url":13284,"first_page":63,"last_page":63,"pdf_url":13285,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13286,"paper_type":860,"authors":13287,"abstract":13294},"lrec2018-main-704","A Corpus Study and Annotation Schema for Named Entity Recognition and Relation Extraction of Business Products","10.63317\u002F32dg89jmdnmd","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-704","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F88.pdf","schon-etal-2018-corpus",[13288,13291,13292,13293],{"paper_id":13281,"author_seq":247,"given_name":13289,"surname":13290,"affiliation":63,"orcid":63},"Saskia","Schön",{"paper_id":13281,"author_seq":232,"given_name":13268,"surname":13269,"affiliation":63,"orcid":63},{"paper_id":13281,"author_seq":218,"given_name":13274,"surname":13275,"affiliation":63,"orcid":63},{"paper_id":13281,"author_seq":203,"given_name":13277,"surname":13278,"affiliation":63,"orcid":63},"Recognizing non-standard entity types and relations, such as B2B products, product classes and their producers, in news and forum texts is important in application areas 
such as supply chain monitoring and market research. However, there is a decided lack of annotated corpora and annotation guidelines in this domain. In this work, we present a corpus study, an annotation schema and associated guidelines, for the annotation of product entity and company-product relation mentions. We find that although product mentions are often realized as noun phrases, defining their exact extent is difficult due to high boundary ambiguity and the broad syntactic and semantic variety of their surface realizations. We also describe our ongoing annotation effort, and present a preliminary corpus of English web and social media documents annotated according to the proposed guidelines.",{"paper_id":13296,"title":13297,"year":81,"month":855,"day":63,"doi":13298,"resource_url":13299,"first_page":63,"last_page":63,"pdf_url":13300,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13301,"paper_type":860,"authors":13302,"abstract":13307},"lrec2018-main-705","Portuguese Named Entity Recognition using Conditional Random Fields and Local Grammars","10.63317\u002F45ggu5xzexsm","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-705","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F309.pdf","pirovani-oliveira-2018-portuguese",[13303,13305],{"paper_id":13296,"author_seq":247,"given_name":3146,"surname":13304,"affiliation":63,"orcid":63},"Pirovani",{"paper_id":13296,"author_seq":232,"given_name":6541,"surname":13306,"affiliation":63,"orcid":63},"Oliveira","Named Entity Recognition involves automatically identifying and classifying entities such as persons, places, and organizations, and it is a very important task in Information Extraction. Conditional Random Fields is a probabilistic method for structured prediction, which can be used in this task. 
This paper presents the use of Conditional Random Fields for Named Entity Recognition in Portuguese texts considering the term classification obtained by a Local Grammar as an additional informed feature. Local grammars are handmade rules to identify named entities within the text. The Golden Collection of the First and Second HAREM considered as a reference for Named Entity Recognition systems in Portuguese were used as training and test sets respectively. The results obtained outperform the results of competitive systems reported in the literature.",{"paper_id":13309,"title":13310,"year":81,"month":855,"day":63,"doi":13311,"resource_url":13312,"first_page":63,"last_page":63,"pdf_url":13313,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13314,"paper_type":860,"authors":13315,"abstract":13325},"lrec2018-main-706","M-CNER: A Corpus for Chinese Named Entity Recognition in Multi-Domains","10.63317\u002F3f4ymwyr9s39","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-706","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F682.pdf","lu-etal-2018-cner",[13316,13318,13320,13322,13324],{"paper_id":13309,"author_seq":247,"given_name":13317,"surname":13224,"affiliation":63,"orcid":63},"Qi",{"paper_id":13309,"author_seq":232,"given_name":13319,"surname":3503,"affiliation":63,"orcid":63},"YaoSheng",{"paper_id":13309,"author_seq":218,"given_name":13321,"surname":1591,"affiliation":63,"orcid":63},"Zhenghua",{"paper_id":13309,"author_seq":203,"given_name":13323,"surname":2401,"affiliation":63,"orcid":63},"Wenliang",{"paper_id":13309,"author_seq":188,"given_name":1503,"surname":2381,"affiliation":63,"orcid":63},"In this paper, we present a new corpus for Chinese Named Entity Recognition (NER) from three domains : human-computer interaction, social media, and e-commerce. The annotation procedure is conducted in two rounds. In the first round, one sentence is annotated by more than one persons independently. 
In the second round, the experts discuss the sentences for which the annotators do not make agreements. Finally, we obtain a corpus which have five data sets in three domains. We further evaluate three popular models on the newly created data sets. The experimental results show that the system based on Bi-LSTM-CRF performs the best among the comparison systems on all the data sets. The corpus can be used for further studies in research community.",{"paper_id":13327,"title":13328,"year":81,"month":855,"day":63,"doi":13329,"resource_url":13330,"first_page":63,"last_page":63,"pdf_url":13331,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13332,"paper_type":860,"authors":13333,"abstract":13342},"lrec2018-main-707","SlugNERDS: A Named Entity Recognition Tool for Open Domain Dialogue Systems","10.63317\u002F49mtbrmt8v7n","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-707","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F819.pdf","bowden-etal-2018-slugnerds",[13334,13335,13337,13338,13341],{"paper_id":13327,"author_seq":247,"given_name":2184,"surname":12627,"affiliation":63,"orcid":63},{"paper_id":13327,"author_seq":232,"given_name":13336,"surname":3703,"affiliation":63,"orcid":63},"Jiaqi",{"paper_id":13327,"author_seq":218,"given_name":12002,"surname":12003,"affiliation":63,"orcid":63},{"paper_id":13327,"author_seq":203,"given_name":13339,"surname":13340,"affiliation":63,"orcid":63},"Amita","Misra",{"paper_id":13327,"author_seq":188,"given_name":11996,"surname":11997,"affiliation":63,"orcid":63},"In dialogue systems, the tasks of named entity recognition (NER) and named entity linking (NEL) are vital preprocessing steps for understanding user intent, especially in open domain interaction where we cannot rely on domain-specific inference. UCSC's effort as one of the funded teams in the 2017 Amazon Alexa Prize Contest has yielded Slugbot, an open domain social bot, aimed at casual conversation. 
We discovered  several challenges specifically associated with both NER and NEL when building Slugbot, such as that the NE labels are too coarse-grained or the entity types are not linked to a useful ontology. Moreover, we have discovered that traditional approaches do not perform well in our context:  even systems designed to operate on tweets or other social media data do not work well in dialogue systems. In this paper, we introduce Slugbot's Named Entity Recognition for dialogue Systems (SlugNERDS), a NER and NEL tool which is optimized to address these issues. We describe two new resources that we are building as part of this work: SlugEntityDB and SchemaActuator. We believe these resources will be useful for the research community.",{"paper_id":13344,"title":13345,"year":81,"month":855,"day":63,"doi":13346,"resource_url":13347,"first_page":63,"last_page":63,"pdf_url":13348,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13349,"paper_type":860,"authors":13350,"abstract":13356},"lrec2018-main-708","Transfer Learning for Named-Entity Recognition with Neural Networks","10.63317\u002F3zgnr6mgyt6m","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-708","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F878.pdf","lee-etal-2018-transfer",[13351,13353,13354],{"paper_id":13344,"author_seq":247,"given_name":13352,"surname":4032,"affiliation":63,"orcid":63},"Ji Young",{"paper_id":13344,"author_seq":232,"given_name":9939,"surname":9940,"affiliation":63,"orcid":63},{"paper_id":13344,"author_seq":218,"given_name":1474,"surname":13355,"affiliation":63,"orcid":63},"Szolovits","Recent approaches based on artificial neural networks (ANNs) have shown promising results for named-entity recognition (NER). In order to achieve high performances, ANNs need to be trained on a large labeled dataset. 
However, labels might be difficult to obtain for the dataset on which the user wants to perform NER: label scarcity is particularly pronounced for patient note de-identification, which is an instance  of NER. In this work, we analyze to what extent transfer learning may address this issue. In particular, we demonstrate that transferring an ANN model trained on a large labeled dataset to another dataset with a limited number of labels improves upon the state-of-the-art results on two different datasets for patient note de-identification.",{"paper_id":13358,"title":13359,"year":81,"month":855,"day":63,"doi":13360,"resource_url":13361,"first_page":63,"last_page":63,"pdf_url":13362,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13363,"paper_type":860,"authors":13364,"abstract":13369},"lrec2018-main-709","ForFun 1.0: Prague Database of Forms and Functions – An Invaluable Resource for Linguistic Research","10.63317\u002F2f3x7oa3etnu","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-709","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F617.pdf","mikulova-bejcek-2018-forfun",[13365,13367],{"paper_id":13358,"author_seq":247,"given_name":4656,"surname":13366,"affiliation":63,"orcid":63},"Mikulová",{"paper_id":13358,"author_seq":232,"given_name":5152,"surname":13368,"affiliation":63,"orcid":63},"Bejček","In this paper, we introduce the first version of ForFun, Prague Database of Forms and Functions, as an invaluable resource for profound linguistic research, particularly in describing syntactic functions and their formal realizations. ForFun is built with the use of already existing richly syntactically annotated corpora, collectively called Prague Dependency Treebanks. ForFun brings this complex annotation of Czech sentences closer to researchers. We demonstrate that ForFun 1.0 provides valuable and rich material allowing to elaborate various syntactic issues in depth. 
We believe that nowadays when corpus linguistics differs from traditional linguistics in its insistence on a systematic study of authentic examples of language in use, our database will contribute to the comprehensive syntactic description.",{"paper_id":13371,"title":13372,"year":81,"month":855,"day":63,"doi":13373,"resource_url":13374,"first_page":63,"last_page":63,"pdf_url":13375,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13376,"paper_type":860,"authors":13377,"abstract":13392},"lrec2018-main-710","The LIA Treebank of Spoken Norwegian Dialects","10.63317\u002F4jthgd5nvfw3","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-710","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F642.pdf","ovrelid-etal-2018-lia",[13378,13379,13381,13384,13386,13389],{"paper_id":13371,"author_seq":247,"given_name":5081,"surname":5082,"affiliation":63,"orcid":63},{"paper_id":13371,"author_seq":232,"given_name":1477,"surname":13380,"affiliation":63,"orcid":63},"Kåsen",{"paper_id":13371,"author_seq":218,"given_name":13382,"surname":13383,"affiliation":63,"orcid":63},"Kristin","Hagen",{"paper_id":13371,"author_seq":203,"given_name":7719,"surname":13385,"affiliation":63,"orcid":63},"Nøklestad",{"paper_id":13371,"author_seq":188,"given_name":13387,"surname":13388,"affiliation":63,"orcid":63},"Per Erik","Solberg",{"paper_id":13371,"author_seq":172,"given_name":13390,"surname":13391,"affiliation":63,"orcid":63},"Janne Bondi","Johannessen","This article presents the LIA treebank of transcribed spoken   Norwegian dialects. It consists of dialect recordings made in the   period between 1950--1990, which have been digitised, transcribed,   and subsequently annotated with morphological and dependency-style   syntactic analysis as part of the LIA (Language Infrastructure made   Accessible) project at the University of Oslo. 
In this article, we   describe the LIA material of dialect recordings and its transcription,   transliteration and further morphosyntactic annotation. We focus in   particular on the extension of the native NDT annotation scheme to   spoken language phenomena, such as pauses and various types of   disfluencies, and present the subsequent conversion of the treebank   to the Universal Dependencies scheme. The treebank currently   consists of 13,608 tokens, distributed over 1396 segments taken from   three different dialects of spoken Norwegian. The LIA treebank annotation is an on-going effort and future releases will extend on the current data set.",{"paper_id":13394,"title":13395,"year":81,"month":855,"day":63,"doi":13396,"resource_url":13397,"first_page":63,"last_page":63,"pdf_url":13398,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13399,"paper_type":860,"authors":13400,"abstract":13403},"lrec2018-main-711","Errator: a Tool to Help Detect Annotation Errors in the Universal Dependencies Project","10.63317\u002F3xf43p3vwa6p","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-711","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F652.pdf","wisniewski-2018-errator",[13401],{"paper_id":13394,"author_seq":247,"given_name":2218,"surname":13402,"affiliation":63,"orcid":63},"Wisniewski","Enforcing guidelines compliance is today one of the main challenge faced by the Universal Dependencies project. This work introduces ERRATOR, a set of tools implementing the annotation variation principle that can be used to help annotators find and correct errors in the different layers of annotations of UD treebanks. 
The results of a first annotation campaign that used ERRATOR to correct and harmonize the annotations of the different French corpora are also described.",{"paper_id":13405,"title":13406,"year":81,"month":855,"day":63,"doi":13407,"resource_url":13408,"first_page":63,"last_page":63,"pdf_url":13409,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13410,"paper_type":860,"authors":13411,"abstract":13426},"lrec2018-main-712","SandhiKosh: A Benchmark Corpus for Evaluating Sanskrit Sandhi Tools","10.63317\u002F3hbzopxzt5fx","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-712","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F755.pdf","bhardwaj-etal-2018-sandhikosh",[13412,13415,13418,13420,13423],{"paper_id":13405,"author_seq":247,"given_name":13413,"surname":13414,"affiliation":63,"orcid":63},"Shubham","Bhardwaj",{"paper_id":13405,"author_seq":232,"given_name":13416,"surname":13417,"affiliation":63,"orcid":63},"Neelamadhav","Gantayat",{"paper_id":13405,"author_seq":218,"given_name":6988,"surname":13419,"affiliation":63,"orcid":63},"Chaturvedi",{"paper_id":13405,"author_seq":203,"given_name":13421,"surname":13422,"affiliation":63,"orcid":63},"Rahul","Garg",{"paper_id":13405,"author_seq":188,"given_name":13424,"surname":13425,"affiliation":63,"orcid":63},"Sumeet","Agarwal","Sanskrit is an ancient Indian language. Several important texts which are of interest to people all over the world today were written in Sanskrit. The Sanskrit grammar has a precise and complete specification given in the text Astadhyayi by Panini. This has led to the development of a number of {\\em  Sanskrit Computational Linguistics} tools for processing and analyzing Sanskrit texts. Unfortunately, there has been no effort to standardize and critically validate these tools. In this paper, we develop a Sanskrit benchmark called SandhiKosh to evaluate the completeness and accuracy of Sanskrit Sandhi tools. 
We present the results of this benchmark on three most prominent Sanskrit tools and demonstrate that these tools have substantial scope for improvement. This benchmark will be freely available to researchers worldwide and we hope it will help everyone working in this area evaluate and validate their tools.",{"paper_id":13428,"title":13429,"year":81,"month":855,"day":63,"doi":13430,"resource_url":13431,"first_page":63,"last_page":63,"pdf_url":13432,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13433,"paper_type":860,"authors":13434,"abstract":13440},"lrec2018-main-713","Czech Legal Text Treebank 2.0","10.63317\u002F4nbv342ap5zw","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-713","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F799.pdf","kriz-hladka-2018-czech",[13435,13437],{"paper_id":13428,"author_seq":247,"given_name":11237,"surname":13436,"affiliation":63,"orcid":63},"Kríž",{"paper_id":13428,"author_seq":232,"given_name":13438,"surname":13439,"affiliation":63,"orcid":63},"Barbora","Hladká","The Czech Legal Text Treebank 2.0 (CLTT 2.0) contains texts that come from the legal domain and are manually syntactically annotated. The syntactic annotation in CLTT 2.0 is more elaborate than in CLTT 1.0. In addition, CLTT 2.0 contains two new annotation layers, namely the layer of entities and the layer of semantic entity relations. 
In total, CLTT 2.0 consists of two documents, 1,121 sentences and 40,950 tokens.",{"paper_id":13442,"title":13443,"year":81,"month":855,"day":63,"doi":13444,"resource_url":13445,"first_page":63,"last_page":63,"pdf_url":13446,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13447,"paper_type":860,"authors":13448,"abstract":13469},"lrec2018-main-714","Creation of a Balanced State-of-the-Art Multilayer Corpus for NLU","10.63317\u002F3rob6twf9o8r","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-714","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F935.pdf","gruzitis-etal-2018-creation",[13449,13452,13455,13458,13460,13463,13466],{"paper_id":13442,"author_seq":247,"given_name":13450,"surname":13451,"affiliation":63,"orcid":63},"Normunds","Gruzitis",{"paper_id":13442,"author_seq":232,"given_name":13453,"surname":13454,"affiliation":63,"orcid":63},"Lauma","Pretkalnina",{"paper_id":13442,"author_seq":218,"given_name":13456,"surname":13457,"affiliation":63,"orcid":63},"Baiba","Saulite",{"paper_id":13442,"author_seq":203,"given_name":1172,"surname":13459,"affiliation":63,"orcid":63},"Rituma",{"paper_id":13442,"author_seq":188,"given_name":13461,"surname":13462,"affiliation":63,"orcid":63},"Gunta","Nespore-Berzkalne",{"paper_id":13442,"author_seq":172,"given_name":13464,"surname":13465,"affiliation":63,"orcid":63},"Arturs","Znotins",{"paper_id":13442,"author_seq":155,"given_name":13467,"surname":13468,"affiliation":63,"orcid":63},"Peteris","Paikens","This paper presents a work in progress to create a multilayered syntactically and semantically annotated text corpus for Latvian. The broad application area we address is natural language understanding (NLU), while more specific applications are abstractive text summarization and knowledge base population, which are required by the project industrial partner, Latvian information agency LETA, for the automation of various media monitoring processes. 
Both the multilayered corpus and the downstream applications are anchored in cross-lingual state-of-the-art representations: Universal Dependencies (UD), FrameNet, PropBank and Abstract Meaning Representation (AMR). In this paper, we particularly focus on the consecutive annotation of the treebank and framebank layers. We also draw links to the ultimate AMR layer and the auxiliary named entity and coreference annotation layers. Since we are aiming at a medium-sized still general-purpose corpus for a less-resourced language, an important aspect we consider is the variety and balance of the corpus in terms of genres, authors and lexical units.",{"paper_id":13471,"title":13472,"year":81,"month":855,"day":63,"doi":13473,"resource_url":13474,"first_page":63,"last_page":63,"pdf_url":13475,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13476,"paper_type":860,"authors":13477,"abstract":13483},"lrec2018-main-715","Test Sets for Chinese Nonlocal Dependency Parsing","10.63317\u002F3j5ekfvvodye","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-715","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F981.pdf","duan-schuler-2018-test",[13478,13481],{"paper_id":13471,"author_seq":247,"given_name":13479,"surname":13480,"affiliation":63,"orcid":63},"Manjuan","Duan",{"paper_id":13471,"author_seq":232,"given_name":6789,"surname":13482,"affiliation":63,"orcid":63},"Schuler","Chinese is a language rich in nonlocal dependencies. Correctly resolving these dependencies is crucial in understanding the predicate-argument structure of a sentence. Making full use of the trace annotations in the Penn Chinese Treebank, this research contributes several test sets of Chinese nonlocal dependencies which occur in different grammatical constructions. 
These datasets can be used by an automatic dependency parser to evaluate its performance on nonlocal dependency resolution in various syntactic constructions in Chinese.",{"paper_id":13485,"title":13486,"year":81,"month":855,"day":63,"doi":13487,"resource_url":13488,"first_page":63,"last_page":63,"pdf_url":13489,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13490,"paper_type":860,"authors":13491,"abstract":13499},"lrec2018-main-716","Adding Syntactic Annotations to Flickr30k Entities Corpus for Multimodal Ambiguous Prepositional-Phrase Attachment Resolution","10.63317\u002F57dinhnuy42x","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-716","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F992.pdf","delecraz-etal-2018-adding",[13492,13494,13495,13496],{"paper_id":13485,"author_seq":247,"given_name":5712,"surname":13493,"affiliation":63,"orcid":63},"Delecraz",{"paper_id":13485,"author_seq":232,"given_name":1127,"surname":1128,"affiliation":63,"orcid":63},{"paper_id":13485,"author_seq":218,"given_name":3852,"surname":3853,"affiliation":63,"orcid":63},{"paper_id":13485,"author_seq":203,"given_name":13497,"surname":13498,"affiliation":63,"orcid":63},"Benoit","Favre","We propose in this paper to add to the captions of the Flickr30k Entities corpus some syntactic annotations in order to study the joint processing of image and language features for the Preposition-Phrase attachment disambiguation task. 
The annotation has been performed on the English version of the captions and automatically projected on their French and German translations.",{"paper_id":13501,"title":13502,"year":81,"month":855,"day":63,"doi":13503,"resource_url":13504,"first_page":63,"last_page":63,"pdf_url":13505,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13506,"paper_type":860,"authors":13507,"abstract":13514},"lrec2018-main-717","Analyzing Middle High German Syntax with RDF and SPARQL","10.63317\u002F2mq2ne2by9x6","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-717","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1016.pdf","chiarcos-etal-2018-analyzing",[13508,13509,13511,13512],{"paper_id":13501,"author_seq":247,"given_name":904,"surname":2684,"affiliation":63,"orcid":63},{"paper_id":13501,"author_seq":232,"given_name":2202,"surname":13510,"affiliation":63,"orcid":63},"Kosmehl",{"paper_id":13501,"author_seq":218,"given_name":904,"surname":8391,"affiliation":63,"orcid":63},{"paper_id":13501,"author_seq":203,"given_name":2301,"surname":13513,"affiliation":63,"orcid":63},"Sukhareva","The paper presents technological foundations for an empirical study of Middle High German (MHG) syntax. We aim to analyze the diachronic changes of MHG syntax on the example of direct and indirect object alterations in the middle field. In the absence of syntactically annotated corpora, we provide a rule-based shallow parser and an enrichment pipeline with the purpose of quantitative evaluation of a qualitative hypothesis. % It discusses how quantitative evaluation of qualitative methods can be used to open up a new prospective on a well-studied phenomenon. We provide a publicaly available enrichment and annotation pipeline grounded. 
A technologically innovative aspect is the application of CoNLL-RDF and SPARQL Update for parsing.",{"paper_id":13516,"title":13517,"year":81,"month":855,"day":63,"doi":13518,"resource_url":13519,"first_page":63,"last_page":63,"pdf_url":13520,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13521,"paper_type":860,"authors":13522,"abstract":13529},"lrec2018-main-718","Cheating a Parser to Death: Data-driven Cross-Treebank Annotation Transfer","10.63317\u002F4wv3nqvmyrmk","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-718","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1101.pdf","seddah-etal-2018-cheating",[13523,13524,13525,13526,13527],{"paper_id":13516,"author_seq":247,"given_name":11621,"surname":11622,"affiliation":63,"orcid":63},{"paper_id":13516,"author_seq":232,"given_name":5665,"surname":2115,"affiliation":63,"orcid":63},{"paper_id":13516,"author_seq":218,"given_name":6238,"surname":6239,"affiliation":63,"orcid":63},{"paper_id":13516,"author_seq":203,"given_name":11251,"surname":11252,"affiliation":63,"orcid":63},{"paper_id":13516,"author_seq":188,"given_name":4656,"surname":13528,"affiliation":63,"orcid":63},"Candito","We present an efficient and accurate method for transferring annotations between two different treebanks of the same language. This method led to the creation of a new instance of the French Treebank (Abeillé et al., 2003), which follows the Universal Dependency annotation scheme and which was proposed to the participants of the CoNLL 2017 Universal Dependency parsing shared task (Zeman et al., 2017). 
Strong results from an evaluation on our gold standard (94.75% of LAS, 99.40% UAS on the test set) demonstrate the quality of this new annotated data set and validate our approach.",{"paper_id":13531,"title":13532,"year":81,"month":855,"day":63,"doi":13533,"resource_url":13534,"first_page":63,"last_page":63,"pdf_url":13535,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13536,"paper_type":860,"authors":13537,"abstract":13549},"lrec2018-main-719","Universal Dependencies and Quantitative Typological Trends. A Case Study on Word Order","10.63317\u002F2jh455cfknyt","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-719","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F1109.pdf","alzetta-etal-2018-universal",[13538,13541,13544,13547],{"paper_id":13531,"author_seq":247,"given_name":13539,"surname":13540,"affiliation":63,"orcid":63},"Chiara","Alzetta",{"paper_id":13531,"author_seq":232,"given_name":13542,"surname":13543,"affiliation":63,"orcid":63},"Felice","Dell’Orletta",{"paper_id":13531,"author_seq":218,"given_name":13545,"surname":13546,"affiliation":63,"orcid":63},"Simonetta","Montemagni",{"paper_id":13531,"author_seq":203,"given_name":2996,"surname":13548,"affiliation":63,"orcid":63},"Venturi","The paper presents a new methodology aimed at acquiring typological evidence from “gold” treebanks for different languages. In particular, it investigates whether and to what extent algorithms developed for assessing the plausibility of automatically produced syntactic annotations could contribute to shed light on key issues of the linguistic typological literature. 
It reports the first and promising results of a case study focusing on word order patterns carried out on three different languages (English, Italian and Spanish).",{"paper_id":13551,"title":13552,"year":81,"month":855,"day":63,"doi":13553,"resource_url":13554,"first_page":63,"last_page":63,"pdf_url":13555,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13556,"paper_type":860,"authors":13557,"abstract":13563},"lrec2018-main-720","Undersampling Improves Hypernymy Prototypicality Learning","10.63317\u002F4z4yktramww2","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-720","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F119.pdf","washio-kato-2018-undersampling",[13558,13561],{"paper_id":13551,"author_seq":247,"given_name":13559,"surname":13560,"affiliation":63,"orcid":63},"Koki","Washio",{"paper_id":13551,"author_seq":232,"given_name":13562,"surname":8023,"affiliation":63,"orcid":63},"Tsuneaki","This paper focuses on supervised hypernymy detection using distributional representations for unknown word pairs. Levy et al. (2015) demonstrated that supervised hypernymy detection suffers from overfitting hypernyms in training data. We show that the problem of overfitting on this task is caused by a characteristic of datasets, which stems from the inherent structure of the language resources used, hierarchical thesauri. The simple data preprocessing method proposed in this paper alleviates this problem. 
To be more precise, we demonstrate through experiments that the problem that hypernymy classifiers overfit hypernyms in training data comes from a skewed word frequency distribution brought by the quasi-tree structure of a thesaurus, which is a major resource of lexical semantic relation data, and propose a simple undersampling method based on word frequencies that can effectively alleviate overfitting and improve distributional prototypicality learning for unknown word pairs.",{"paper_id":13565,"title":13566,"year":81,"month":855,"day":63,"doi":13567,"resource_url":13568,"first_page":63,"last_page":63,"pdf_url":13569,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13570,"paper_type":860,"authors":13571,"abstract":13579},"lrec2018-main-721","Interoperability of Language-related Information: Mapping the BLL Thesaurus to Lexvo and Glottolog","10.63317\u002F542ch8k7avxi","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-721","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F259.pdf","dimitrova-etal-2018-interoperability",[13572,13574,13575,13576,13578],{"paper_id":13565,"author_seq":247,"given_name":1266,"surname":13573,"affiliation":63,"orcid":63},"Dimitrova",{"paper_id":13565,"author_seq":232,"given_name":904,"surname":8391,"affiliation":63,"orcid":63},{"paper_id":13565,"author_seq":218,"given_name":904,"surname":2684,"affiliation":63,"orcid":63},{"paper_id":13565,"author_seq":203,"given_name":12691,"surname":13577,"affiliation":63,"orcid":63},"Renner-Westermann",{"paper_id":13565,"author_seq":188,"given_name":3326,"surname":8389,"affiliation":63,"orcid":63},"Since 2013, the thesaurus of the Bibliography of Linguistic Literature (BLL Thesaurus) has been applied in the context of the Lin|gu|is|tik portal, a hub for linguistically relevant information. 
Several consecutive projects focus on the modeling of the BLL Thesaurus as ontology and its linking to terminological repositories in the Linguistic Linked Open Data (LLOD) cloud. Those mappings facilitate the connection between the Lin|gu|is|tik portal and the cloud. In the paper, we describe the current efforts to establish interoperability between the language-related index terms and repositories providing language identifiers for the web of Linked Data. After an introduction of Lexvo and Glottolog, we outline the scope, the structure, and the peculiarities of the BLL Thesaurus. We discuss the challenges for the design of scientifically plausible language classification and the linking between divergent classifications. We describe the prototype of the linking model and propose pragmatic solutions for structural or conceptual conflicts. Additionally, we depict the benefits from the envisaged interoperability - for the Lin|gu|is|tik portal, and the Linked Open Data Community in general.",{"paper_id":13581,"title":13582,"year":81,"month":855,"day":63,"doi":13583,"resource_url":13584,"first_page":63,"last_page":63,"pdf_url":13585,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13586,"paper_type":860,"authors":13587,"abstract":13593},"lrec2018-main-722","Browsing and Supporting Pluricentric Global Wordnet, or just your Wordnet of 
Interest","10.63317\u002F2fovmp3hewob","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-722","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F277.pdf","branco-etal-2018-browsing",[13588,13589,13591,13592],{"paper_id":13581,"author_seq":247,"given_name":1332,"surname":1333,"affiliation":63,"orcid":63},{"paper_id":13581,"author_seq":232,"given_name":13590,"surname":1333,"affiliation":63,"orcid":63},"Ruben",{"paper_id":13581,"author_seq":218,"given_name":10000,"surname":10001,"affiliation":63,"orcid":63},{"paper_id":13581,"author_seq":203,"given_name":3884,"surname":4259,"affiliation":63,"orcid":63},"In this paper we proceed with a systematic gathering of design requirements for wordnet browsers that permit to consult the content of wordnets. This is undertaken together with a review of the functionalities of existing browsers. On the basis of this analysis, we present a new wordnet browser we developed that meets these requirements and thus complies with the most ample range of design features. This is an open source browser that is freely distributed and can be reused by anyone interested in doing research on or just using wordnets. We also introduce the notion of a pluricentric global wordnet, for whose undertaking this new advanced browser appears as an important instrument and motivation. This is a promising operative conception for a bootstrapped yet effective process towards the ultimate global wordnet, where all individual wordnets from all languages are meant to eventually converge together, in spite of the plurality of their formats, licenses, depth, etc. 
that is intrinsic to an inherently plural endeavor undertaken by multiple actors under multiple constraints across the world.",{"paper_id":13595,"title":13596,"year":81,"month":855,"day":63,"doi":13597,"resource_url":13598,"first_page":63,"last_page":63,"pdf_url":13599,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13600,"paper_type":860,"authors":13601,"abstract":13608},"lrec2018-main-723","Cross-checking WordNet and SUMO Using Meronymy","10.63317\u002F5ovgksxyi4e5","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-723","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F308.pdf","alvez-etal-2018-cross",[13602,13605,13607],{"paper_id":13595,"author_seq":247,"given_name":13603,"surname":13604,"affiliation":63,"orcid":63},"Javier","Álvez",{"paper_id":13595,"author_seq":232,"given_name":8039,"surname":13606,"affiliation":63,"orcid":63},"Gonzalez-Dios",{"paper_id":13595,"author_seq":218,"given_name":6777,"surname":6778,"affiliation":63,"orcid":63},"We report on the practical application of a black-box testing methodology for the validation of the knowledge encoded in WordNet, SUMO and their mapping by using automated theorem provers. Our proposal is based on the part-whole information provided by WordNet, out of which we automatically create a large set of tests. 
Our experimental results confirm that the proposed system enables the validation of some pieces of information and also the detection of missing information or inconsistencies among these resources.",{"paper_id":13610,"title":13611,"year":81,"month":855,"day":63,"doi":13612,"resource_url":13613,"first_page":63,"last_page":63,"pdf_url":13614,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13615,"paper_type":860,"authors":13616,"abstract":13621},"lrec2018-main-724","Extended HowNet 2.0 – An Entity-Relation Common-Sense Representation Model","10.63317\u002F3s9b4etqe8vj","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-724","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F547.pdf","ma-shih-2018-extended",[13617,13618],{"paper_id":13610,"author_seq":247,"given_name":3399,"surname":3400,"affiliation":63,"orcid":63},{"paper_id":13610,"author_seq":232,"given_name":13619,"surname":13620,"affiliation":63,"orcid":63},"Yueh-Yin","Shih","In this paper, we propose Extended HowNet 2.0 – an entity-relation common-sense representation model. Comparing to HowNet and Extended HowNet, E-HowNet 2.0 has the following improvements: (a) Reorganizing the hierarchical structure of primitives and basic concepts; (b) Rich lexical information: In addition to sense definition, each entry of lexical sense may also include operational expressions as well as semantic functions which facilitate future semantic composition processes. (c) Improvement of sense definitions and sense definitions for basic concepts. (d) Developing a new automatic ontology reconstruction system. 
(e) Developing a query system called E-HowNet Relation Database for flexibly clustering concepts.We hope Extended HowNet 2.0 can bring significant benefits to the community of lexical semantics and natural language understanding.",{"paper_id":13623,"title":13624,"year":81,"month":855,"day":63,"doi":13625,"resource_url":13626,"first_page":63,"last_page":63,"pdf_url":13627,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13628,"paper_type":860,"authors":13629,"abstract":13633},"lrec2018-main-725","The Circumstantial Event Ontology (CEO) and ECB+\u002FCEO: an Ontology and Corpus for Implicit Causal Relations between Events","10.63317\u002F3f2f5efxevpi","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-725","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F603.pdf","segers-etal-2018-circumstantial",[13630,13631,13632],{"paper_id":13623,"author_seq":247,"given_name":9467,"surname":9468,"affiliation":63,"orcid":63},{"paper_id":13623,"author_seq":232,"given_name":1894,"surname":1895,"affiliation":63,"orcid":63},{"paper_id":13623,"author_seq":218,"given_name":4174,"surname":4175,"affiliation":63,"orcid":63},"In this paper, we describe the Circumstantial Event Ontology (CEO), a newly developed ontology for calamity events that models semantic circumstantial relations between event classes, where we define circumstantial as inferred implicit causal relations. The circumstantial relations are inferred from the assertions of the event classes that involve a change to the same property of a participant. Our model captures that the change yielded by one event, explains to people the happening of the next event when observed. 
We describe the meta model and the contents of the ontology, the creation of a manually annotated corpus for circumstantial relations based on ECB+ and the first results on the evaluation of the ontology.",{"paper_id":13635,"title":13636,"year":81,"month":855,"day":63,"doi":13637,"resource_url":13638,"first_page":63,"last_page":63,"pdf_url":13639,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13640,"paper_type":860,"authors":13641,"abstract":13647},"lrec2018-main-726","Profiling Medical Journal Articles Using a Gene Ontology Semantic Tagger","10.63317\u002F33uwuoy2gbhh","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-726","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F706.pdf","el-haj-etal-2018-profiling",[13642,13643,13644,13645],{"paper_id":13635,"author_seq":247,"given_name":11030,"surname":11031,"affiliation":63,"orcid":63},{"paper_id":13635,"author_seq":232,"given_name":3690,"surname":3833,"affiliation":63,"orcid":63},{"paper_id":13635,"author_seq":218,"given_name":3830,"surname":3831,"affiliation":63,"orcid":63},{"paper_id":13635,"author_seq":203,"given_name":13646,"surname":3836,"affiliation":63,"orcid":63},"Jo","In many areas of academic publishing, there is an explosion of literature, and sub-division of fields into subfields, leading to stove-piping where sub-communities of expertise become disconnected from each other. This is especially true in the genetics literature over the last 10 years where researchers are no longer able to maintain knowledge of previously related areas. This paper extends several approaches based on natural language processing and corpus linguistics which allow us to examine corpora derived from bodies of genetics literature and will help to make comparisons and improve retrieval methods using domain knowledge via an existing gene ontology. 
We derived two open access medical journal corpora from PubMed related to psychiatric genetics and immune disorder genetics. We created a novel Gene Ontology Semantic Tagger (GOST) and lexicon to annotate the corpora and are then able to compare subsets of literature to understand the relative distributions of genetic terminology, thereby enabling researchers to make improved connections between them.",{"paper_id":13649,"title":13650,"year":81,"month":855,"day":63,"doi":13651,"resource_url":13652,"first_page":63,"last_page":63,"pdf_url":13653,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13654,"paper_type":860,"authors":13655,"abstract":13659},"lrec2018-main-727","Towards a Conversation-Analytic Taxonomy of Speech Overlap","10.63317\u002F3wta4r4erj6p","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-727","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F792.pdf","gervits-scheutz-2018-towards",[13656,13657],{"paper_id":13649,"author_seq":247,"given_name":1211,"surname":1212,"affiliation":63,"orcid":63},{"paper_id":13649,"author_seq":232,"given_name":1244,"surname":13658,"affiliation":63,"orcid":63},"Scheutz","We present a taxonomy for classifying speech overlap in natural language dialogue. The scheme classifies overlap on the basis of several features, including onset point, local dialogue history, and management behavior. We describe the various dimensions of this scheme and show how it was applied to a corpus of remote, collaborative dialogue. 
Moving forward, this will serve as the basis for a computational model of speech overlap, and for use in artificial agents that interact with humans in social settings.",{"paper_id":13661,"title":13662,"year":81,"month":855,"day":63,"doi":13663,"resource_url":13664,"first_page":63,"last_page":63,"pdf_url":13665,"poster_url":63,"slide_url":63,"video_url":63,"supplementary_url":63,"bibkey":13666,"paper_type":860,"authors":13667,"abstract":13673},"lrec2018-main-728","Indian Language Wordnets and their Linkages with Princeton WordNet","10.63317\u002F34wj9mjkjpiq","https:\u002F\u002Flrec.elra.info\u002Flrec2018-main-728","http:\u002F\u002Fwww.lrec-conf.org\u002Fproceedings\u002Flrec2018\u002Fpdf\u002F936.pdf","kanojia-etal-2018-indian",[13668,13671,13672],{"paper_id":13661,"author_seq":247,"given_name":13669,"surname":13670,"affiliation":63,"orcid":63},"Diptesh","Kanojia",{"paper_id":13661,"author_seq":232,"given_name":2184,"surname":9609,"affiliation":63,"orcid":63},{"paper_id":13661,"author_seq":218,"given_name":1864,"surname":1865,"affiliation":63,"orcid":63},"Wordnets are rich lexico-semantic resources. Linked wordnets are extensions of wordnets, which link similar concepts in wordnets of different languages. Such resources are extremely useful in many Natural Language Processing (NLP) applications, primarily those based on knowledge-based approaches. In such approaches, these resources are considered as gold standard\u002Foracle. Thus, it is crucial that these resources hold correct information. Thereby, they are created by human experts. However, human experts in multiple languages are hard to come by. Thus, the community would benefit from sharing of such manually created resources. In this paper, we release mappings of 18 Indian language wordnets linked with Princeton WordNet. We believe that availability of such resources will have a direct impact on the progress in NLP for these languages."]