4 from itertools import filterfalse
5 from typing import List, Tuple, Union
8 class _lazyclassproperty:
9 def __init__(self, fn):
11 self.__doc__ = fn.__doc__
12 self.__name__ = fn.__name__
14 def __get__(self, obj, cls):
17 if not hasattr(cls, "_intern") or any(
18 cls._intern is getattr(superclass, "_intern", [])
19 for superclass in cls.__mro__[1:]
22 attrname = self.fn.__name__
23 if attrname not in cls._intern:
24 cls._intern[attrname] = self.fn(cls)
25 return cls._intern[attrname]
# Type of a ``_ranges`` class attribute: a list of code point ranges, each
# either a (first, last) 2-tuple or a 1-tuple (x,), which this file documents
# as equivalent to (x, x).  Ranges are inclusive at both ends.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
33 A set of Unicode characters, for language-specific strings for
34 ``alphas``, ``nums``, ``alphanums``, and ``printables``.
35 A unicode_set is defined by a list of ranges in the Unicode character
36 set, in a class attribute ``_ranges``. Ranges can be specified using
37 2-tuples or a 1-tuple, such as::
45 Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).
47 A unicode set can also be defined using multiple inheritance of other unicode sets::
49 class CJK(Chinese, Japanese, Korean):
53 _ranges: UnicodeRangeList = []
56 def _chars_for_ranges(cls):
58 for cc in cls.__mro__:
61 for rr in getattr(cc, "_ranges", ()):
62 ret.extend(range(rr[0], rr[-1] + 1))
63 return [chr(c) for c in sorted(set(ret))]
67 "all non-whitespace characters in this range"
68 return "".join(filterfalse(str.isspace, cls._chars_for_ranges))
72 "all alphabetic characters in this range"
73 return "".join(filter(str.isalpha, cls._chars_for_ranges))
77 "all numeric digit characters in this range"
78 return "".join(filter(str.isdigit, cls._chars_for_ranges))
82 "all alphanumeric characters in this range"
83 return cls.alphas + cls.nums
87 "all characters in this range that are valid identifier characters, plus underscore '_'"
91 "".join(filter(str.isidentifier, cls._chars_for_ranges))
92 + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
93 + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
100 def identbodychars(cls):
102 all characters in this range that are valid identifier body characters,
111 [c for c in cls._chars_for_ranges if ("_" + c).isidentifier()]
118 class pyparsing_unicode(unicode_set):
120 A namespace class for defining common language unicode_sets.
125 # define ranges in language character sets
126 _ranges: UnicodeRangeList = [
127 (0x0020, sys.maxunicode),
130 class BasicMultilingualPlane(unicode_set):
131 "Unicode set for the Basic Multilingual Plane"
132 _ranges: UnicodeRangeList = [
136 class Latin1(unicode_set):
137 "Unicode set for Latin-1 Unicode Character Range"
138 _ranges: UnicodeRangeList = [
143 class LatinA(unicode_set):
144 "Unicode set for Latin-A Unicode Character Range"
145 _ranges: UnicodeRangeList = [
149 class LatinB(unicode_set):
150 "Unicode set for Latin-B Unicode Character Range"
151 _ranges: UnicodeRangeList = [
155 class Greek(unicode_set):
156 "Unicode set for Greek Unicode Character Ranges"
157 _ranges: UnicodeRangeList = [
195 class Cyrillic(unicode_set):
196 "Unicode set for Cyrillic Unicode Character Range"
197 _ranges: UnicodeRangeList = [
208 class Chinese(unicode_set):
209 "Unicode set for Chinese Unicode Character Range"
210 _ranges: UnicodeRangeList = [
231 class Japanese(unicode_set):
232 "Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"
233 _ranges: UnicodeRangeList = []
235 class Kanji(unicode_set):
236 "Unicode set for Kanji Unicode Character Range"
237 _ranges: UnicodeRangeList = [
242 class Hiragana(unicode_set):
243 "Unicode set for Hiragana Unicode Character Range"
244 _ranges: UnicodeRangeList = [
254 class Katakana(unicode_set):
255 "Unicode set for Katakana Unicode Character Range"
256 _ranges: UnicodeRangeList = [
268 class Hangul(unicode_set):
269 "Unicode set for Hangul (Korean) Unicode Character Range"
270 _ranges: UnicodeRangeList = [
290 class CJK(Chinese, Japanese, Hangul):
291 "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"
293 class Thai(unicode_set):
294 "Unicode set for Thai Unicode Character Range"
295 _ranges: UnicodeRangeList = [
300 class Arabic(unicode_set):
301 "Unicode set for Arabic Unicode Character Range"
302 _ranges: UnicodeRangeList = [
308 class Hebrew(unicode_set):
309 "Unicode set for Hebrew Unicode Character Range"
310 _ranges: UnicodeRangeList = [
322 class Devanagari(unicode_set):
323 "Unicode set for Devanagari Unicode Character Range"
324 _ranges: UnicodeRangeList = [
# ``Japanese`` is declared with an empty ``_ranges``; fill it in here from the
# ranges of its three nested script classes.
pyparsing_unicode.Japanese._ranges = (
    pyparsing_unicode.Japanese.Kanji._ranges
    + pyparsing_unicode.Japanese.Hiragana._ranges
    + pyparsing_unicode.Japanese.Katakana._ranges
)

# Short alias for the Basic Multilingual Plane set.
pyparsing_unicode.BMP = pyparsing_unicode.BasicMultilingualPlane

# add language identifiers using language Unicode
pyparsing_unicode.العربية = pyparsing_unicode.Arabic
pyparsing_unicode.中文 = pyparsing_unicode.Chinese
pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
pyparsing_unicode.Ελληνικά = pyparsing_unicode.Greek
pyparsing_unicode.עִברִית = pyparsing_unicode.Hebrew
pyparsing_unicode.日本語 = pyparsing_unicode.Japanese
pyparsing_unicode.Japanese.漢字 = pyparsing_unicode.Japanese.Kanji
pyparsing_unicode.Japanese.カタカナ = pyparsing_unicode.Japanese.Katakana
pyparsing_unicode.Japanese.ひらがな = pyparsing_unicode.Japanese.Hiragana
# NOTE(review): only ``Hangul`` is visible as a class name here; this line
# assumes a ``Korean`` attribute is also defined on pyparsing_unicode -- confirm.
pyparsing_unicode.한국어 = pyparsing_unicode.Korean
pyparsing_unicode.ไทย = pyparsing_unicode.Thai
pyparsing_unicode.देवनागरी = pyparsing_unicode.Devanagari