% WMT 2016 Bilingual Document Alignment shared-task system paper (ILSP/ARC).
% Acronyms and proper nouns in the title are brace-protected as whole words so
% sentence-casing styles keep their capitalisation; percent signs in the
% abstract are escaped so the field is safe if a style prints it.
@inproceedings{papavassiliou-prokopidis-piperidis:2016:WMT,
  author    = {Papavassiliou, Vassilis and Prokopidis, Prokopis and Piperidis, Stelios},
  title     = {The {ILSP/ARC} submission to the {WMT} 2016 {Bilingual} {Document} {Alignment} {Shared} {Task}},
  booktitle = {Proceedings of the First Conference on Machine Translation},
  month     = aug,
  year      = {2016},
  address   = {Berlin, Germany},
  publisher = {Association for Computational Linguistics},
  pages     = {733--739},
  url       = {http://www.aclweb.org/anthology/W16-2375.pdf},
  abstract  = {This paper describes ILSP-ARC-pv42, the Institute for Language and Speech Processing/Athena Research and Innovation Center submission for the WMT 2016 Bilingual Document Alignment shared task. We describe several document and collection-aware features that our system explored in the context of the task. On the test dataset, our submission achieved a recall of 84.93\%, even though it does not make use of any language-specific resources like bilingual lexica or MT output. Instead, our system is based on shallow features (including links to documents in the same webdomain, URLs, digits, image filenames and HTML structure) that can be easily extracted from web documents. We also present examples to show that when de-duplication issues in the test dataset are properly addressed, our system reaches a significantly higher recall of 92.5\%.},
}