2 Metadata about languages used by our model training code for our
3 SingleByteCharSetProbers. Could be used for other things in the future.
5 This code is based on the language metadata from the uchardet project.
8 from string import ascii_letters
9 from typing import List, Optional
11 # TODO: Add Ukrainian (KOI8-U)
15 """Metadata about a language useful for training models
17 :ivar name: The human name for the language, in English.
19 :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
20 or use another catalog as a last resort.
22 :ivar use_ascii: Whether or not ASCII letters should be included in trained models.
25 :ivar charsets: The charsets we want to support and create data for.
26 :type charsets: list of str
27 :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
28 `True`, you only need to add those not in the ASCII set.
30 :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
31 Wikipedia for training data.
32 :type wiki_start_pages: list of str
37 name: Optional[str] = None,
38 iso_code: Optional[str] = None,
39 use_ascii: bool = True,
40 charsets: Optional[List[str]] = None,
41 alphabet: Optional[str] = None,
42 wiki_start_pages: Optional[List[str]] = None,
46 self.iso_code = iso_code
47 self.use_ascii = use_ascii
48 self.charsets = charsets
51 alphabet += ascii_letters
53 alphabet = ascii_letters
55 raise ValueError("Must supply alphabet if use_ascii is False")
56 self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None
57 self.wiki_start_pages = wiki_start_pages
59 def __repr__(self) -> str:
60 param_str = ", ".join(
61 f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_")
63 return f"{self.__class__.__name__}({param_str})"
71 # We only support encodings that use isolated forms, because the
72 # current recommendation is that the rendering system handles
73 # presentation forms. That rationale was meant to exclude IBM864, but
74 # NOTE(review): "CP864" listed below is the same code page -- confirm intent.
75 charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
76 alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
77 wiki_start_pages=["الصفحة_الرئيسية"],
79 "Belarusian": Language(
83 charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
84 alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
85 wiki_start_pages=["Галоўная_старонка"],
87 "Bulgarian": Language(
91 charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
92 alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
93 wiki_start_pages=["Начална_страница"],
99 charsets=["ISO-8859-2", "WINDOWS-1250"],
100 alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
101 wiki_start_pages=["Hlavní_strana"],
107 charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
109 wiki_start_pages=["Forside"],
115 charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
117 wiki_start_pages=["Wikipedia:Hauptseite"],
123 charsets=["ISO-8859-7", "WINDOWS-1253"],
124 alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
125 wiki_start_pages=["Πύλη:Κύρια"],
131 charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
132 wiki_start_pages=["Main_Page"],
134 "Esperanto": Language(
137 # Q, W, X, and Y not used at all
139 charsets=["ISO-8859-3"],
140 alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
141 wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
147 charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
148 alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
149 wiki_start_pages=["Wikipedia:Portada"],
151 "Estonian": Language(
155 charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
156 # C, F, Š, Q, W, X, Y, Z, Ž are only for loanwords
158 alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
159 wiki_start_pages=["Esileht"],
165 charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
166 alphabet="ÅÄÖŠŽåäöšž",
167 wiki_start_pages=["Wikipedia:Etusivu"],
173 charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
174 alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
175 wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
181 charsets=["ISO-8859-8", "WINDOWS-1255"],
182 alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
183 wiki_start_pages=["עמוד_ראשי"],
185 "Croatian": Language(
188 # Q, W, X, Y are only used for foreign words.
190 charsets=["ISO-8859-2", "WINDOWS-1250"],
191 alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
192 wiki_start_pages=["Glavna_stranica"],
194 "Hungarian": Language(
197 # Q, W, X, Y are only used for foreign words.
199 charsets=["ISO-8859-2", "WINDOWS-1250"],
200 alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
201 wiki_start_pages=["Kezdőlap"],
207 charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
208 alphabet="ÀÈÉÌÒÓÙàèéìòóù",
209 wiki_start_pages=["Pagina_principale"],
211 "Lithuanian": Language(
215 charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
216 # Q, W, and X not used at all
217 alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
218 wiki_start_pages=["Pagrindinis_puslapis"],
224 charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
225 # Q, W, X, Y are only for loanwords
226 alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
227 wiki_start_pages=["Sākumlapa"],
229 "Macedonian": Language(
233 charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
234 alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
235 wiki_start_pages=["Главна_страница"],
241 charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
242 wiki_start_pages=["Hoofdpagina"],
247 # Q and X are only used for foreign words.
249 charsets=["ISO-8859-2", "WINDOWS-1250"],
250 alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
251 wiki_start_pages=["Wikipedia:Strona_główna"],
253 "Portuguese": Language(
257 charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
258 alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
259 wiki_start_pages=["Wikipédia:Página_principal"],
261 "Romanian": Language(
265 charsets=["ISO-8859-2", "WINDOWS-1250"],
266 alphabet="ăâîșțĂÂÎȘȚ",
267 wiki_start_pages=["Pagina_principală"],
281 alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
282 wiki_start_pages=["Заглавная_страница"],
288 charsets=["ISO-8859-2", "WINDOWS-1250"],
289 alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
290 wiki_start_pages=["Hlavná_stránka"],
295 # Q, W, X, Y are only used for foreign words.
297 charsets=["ISO-8859-2", "WINDOWS-1250"],
298 alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
299 wiki_start_pages=["Glavna_stran"],
301 # Serbian can be written in both Latin and Cyrillic, but there's no
302 # simple way to get the Latin alphabet pages from Wikipedia through
303 # the API, so for now we just support Cyrillic.
307 alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
308 charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
309 wiki_start_pages=["Главна_страна"],
315 charsets=["ISO-8859-11", "TIS-620", "CP874"],
316 alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
317 wiki_start_pages=["หน้าหลัก"],
322 # Q, W, and X are not used by Turkish
324 charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
325 alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
326 wiki_start_pages=["Ana_Sayfa"],
328 "Vietnamese": Language(
332 # Windows-1258 is the only common 8-bit
333 # Vietnamese encoding supported by Python.
335 # For systems that lack support for Unicode,
336 # dozens of 8-bit Vietnamese code pages are
337 # available. The most common are VISCII
338 # (TCVN 5712:1993), VPS, and Windows-1258.
339 # Where ASCII is required, such as when
340 # ensuring readability in plain text e-mail,
341 # Vietnamese letters are often encoded
342 # according to Vietnamese Quoted-Readable
343 # (VIQR) or VSCII Mnemonic (VSCII-MNEM),
344 # though usage of either variable-width
345 # scheme has declined dramatically following
346 # the adoption of Unicode on the World Wide Web.
348 charsets=["WINDOWS-1258"],
349 alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
350 wiki_start_pages=["Chữ_Quốc_ngữ"],