eb40c5f0c8526208d434d762855d23079dc68b36
[SubU] /
1 """
2 Metadata about languages used by our model training code for our
3 SingleByteCharSetProbers.  Could be used for other things in the future.
4
5 This code is based on the language metadata from the uchardet project.
6 """
7
8 from string import ascii_letters
9 from typing import List, Optional
10
11 # TODO: Add Ukrainian (KOI8-U)
12
13
class Language:
    """Container for the per-language metadata used when training models.

    :ivar name: English, human-readable name of the language.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 code when one exists, otherwise a
                    3-letter ISO code, otherwise a code from another catalog.
    :type iso_code: str
    :ivar use_ascii: Whether the ASCII letters are part of the alphabet used
                     for trained models.
    :type use_ascii: bool
    :ivar charsets: The charsets to support and generate data for.
    :type charsets: list of str
    :ivar alphabet: The characters of the language's alphabet, stored
                    de-duplicated and sorted by code point. When `use_ascii`
                    is `True`, callers only pass the non-ASCII characters;
                    the ASCII letters are merged in automatically.
    :type alphabet: str
    :ivar wiki_start_pages: Wikipedia pages to start from when crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(
        self,
        name: Optional[str] = None,
        iso_code: Optional[str] = None,
        use_ascii: bool = True,
        charsets: Optional[List[str]] = None,
        alphabet: Optional[str] = None,
        wiki_start_pages: Optional[List[str]] = None,
    ) -> None:
        """Store the metadata and normalise the alphabet.

        :raises ValueError: if `use_ascii` is false and no alphabet is given.
        """
        super().__init__()
        # Attribute assignment order is what __repr__ reports, so keep it
        # stable: name, iso_code, use_ascii, charsets, alphabet, wiki pages.
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets

        if not self.use_ascii:
            # Without ASCII there is nothing to fall back on.
            if not alphabet:
                raise ValueError("Must supply alphabet if use_ascii is False")
            letters = alphabet
        elif alphabet:
            letters = alphabet + ascii_letters
        else:
            letters = ascii_letters

        if letters:
            # Drop duplicates, then order by code point.
            self.alphabet = "".join(sorted(set(letters)))
        else:
            self.alphabet = None
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self) -> str:
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        public_parts = [
            f"{key}={value!r}"
            for key, value in vars(self).items()
            if not key.startswith("_")
        ]
        return f"{type(self).__name__}({', '.join(public_parts)})"
64
65
# Registry of the supported languages, keyed by the English language name.
# Each value is the Language metadata (ISO code, target charsets, alphabet
# and Wikipedia start pages) for that name.
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): "CP864" below is the same code page as IBM864, which
        # contradicts the sentence above -- confirm whether it belongs here.
        charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        wiki_start_pages=["الصفحة_الرئيسية"],
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        wiki_start_pages=["Галоўная_старонка"],
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        wiki_start_pages=["Начална_страница"],
    ),
    "Czech": Language(
        name="Czech",
        # The ISO 639-1 language code for Czech is "cs"; "cz" is the
        # country code and is not a valid ISO 639-1 language code.
        iso_code="cs",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
        wiki_start_pages=["Hlavní_strana"],
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="æøåÆØÅ",
        wiki_start_pages=["Forside"],
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="äöüßẞÄÖÜ",
        wiki_start_pages=["Wikipedia:Hauptseite"],
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=["ISO-8859-7", "WINDOWS-1253"],
        # Lowercase has both sigma forms (σ and final ς); uppercase only
        # has Σ, so it is listed once.
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        wiki_start_pages=["Πύλη:Κύρια"],
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Main_Page"],
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=["ISO-8859-3"],
        alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
        wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
        wiki_start_pages=["Wikipedia:Portada"],
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=False,
        charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
        wiki_start_pages=["Esileht"],
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÅÄÖŠŽåäöšž",
        wiki_start_pages=["Wikipedia:Etusivu"],
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
        wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=["ISO-8859-8", "WINDOWS-1255"],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        wiki_start_pages=["עמוד_ראשי"],
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stranica"],
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
        wiki_start_pages=["Kezdőlap"],
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÀÈÉÌÒÓÙàèéìòóù",
        wiki_start_pages=["Pagina_principale"],
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, and X not used at all
        alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
        wiki_start_pages=["Pagrindinis_puslapis"],
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, X, Y are only for loanwords
        alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
        wiki_start_pages=["Sākumlapa"],
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        wiki_start_pages=["Главна_страница"],
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"],
        wiki_start_pages=["Hoofdpagina"],
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
        wiki_start_pages=["Wikipedia:Strona_główna"],
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"],
        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
        wiki_start_pages=["Wikipédia:Página_principal"],
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="ăâîșțĂÂÎȘȚ",
        wiki_start_pages=["Pagina_principală"],
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "ISO-8859-5",
            "WINDOWS-1251",
            "KOI8-R",
            "MacCyrillic",
            "IBM866",
            "IBM855",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        wiki_start_pages=["Заглавная_страница"],
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
        wiki_start_pages=["Hlavná_stránka"],
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stran"],
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        # Cyrillic-only (see above), so the ASCII letters must not be merged
        # into the alphabet; this matches the other Cyrillic entries.
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        wiki_start_pages=["Главна_страна"],
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=["ISO-8859-11", "TIS-620", "CP874"],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        wiki_start_pages=["หน้าหลัก"],
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
        wiki_start_pages=["Ana_Sayfa"],
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=["WINDOWS-1258"],
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
        wiki_start_pages=["Chữ_Quốc_ngữ"],
    ),
}