git.reasoningtechnology.com Git - SubU/blob

   1 # helpers.py
   2 import html.entities
   3 import re
   4 import typing
   5
   6 from . import __diag__
   7 from .core import *
   8 from .util import _bslash, _flatten, _escape_regex_range_chars
   9
  10
  11 #
  12 # global helpers
  13 #
  14 def delimited_list(
  15     expr: Union[str, ParserElement],
  16     delim: Union[str, ParserElement] = ",",
  17     combine: bool = False,
  18     min: typing.Optional[int] = None,
  19     max: typing.Optional[int] = None,
  20     *,
  21     allow_trailing_delim: bool = False,
  22 ) -> ParserElement:
  23     """Helper to define a delimited list of expressions - the delimiter
  24     defaults to ','. By default, the list elements and delimiters can
  25     have intervening whitespace, and comments, but this can be
  26     overridden by passing ``combine=True`` in the constructor. If
  27     ``combine`` is set to ``True``, the matching tokens are
  28     returned as a single token string, with the delimiters included;
  29     otherwise, the matching tokens are returned as a list of tokens,
  30     with the delimiters suppressed.
  31
  32     If ``allow_trailing_delim`` is set to True, then the list may end with
  33     a delimiter.
  34
  35     Example::
  36
  37         delimited_list(Word(alphas)).parse_string("aa,bb,cc") # -> ['aa', 'bb', 'cc']
  38         delimited_list(Word(hexnums), delim=':', combine=True).parse_string("AA:BB:CC:DD:EE") # -> ['AA:BB:CC:DD:EE']
  39     """
  40     if isinstance(expr, str_type):
  41         expr = ParserElement._literalStringClass(expr)
  42
  43     dlName = "{expr} [{delim} {expr}]...{end}".format(
  44         expr=str(expr.copy().streamline()),
  45         delim=str(delim),
  46         end=" [{}]".format(str(delim)) if allow_trailing_delim else "",
  47     )
  48
  49     if not combine:
  50         delim = Suppress(delim)
  51
  52     if min is not None:
  53         if min < 1:
  54             raise ValueError("min must be greater than 0")
  55         min -= 1
  56     if max is not None:
  57         if min is not None and max <= min:
  58             raise ValueError("max must be greater than, or equal to min")
  59         max -= 1
  60     delimited_list_expr = expr + (delim + expr)[min, max]
  61
  62     if allow_trailing_delim:
  63         delimited_list_expr += Opt(delim)
  64
  65     if combine:
  66         return Combine(delimited_list_expr).set_name(dlName)
  67     else:
  68         return delimited_list_expr.set_name(dlName)
  69
  70
  71 def counted_array(
  72     expr: ParserElement,
  73     int_expr: typing.Optional[ParserElement] = None,
  74     *,
  75     intExpr: typing.Optional[ParserElement] = None,
  76 ) -> ParserElement:
  77     """Helper to define a counted list of expressions.
  78
  79     This helper defines a pattern of the form::
  80
  81         integer expr expr expr...
  82
  83     where the leading integer tells how many expr expressions follow.
  84     The matched tokens returns the array of expr tokens as a list - the
  85     leading count token is suppressed.
  86
  87     If ``int_expr`` is specified, it should be a pyparsing expression
  88     that produces an integer value.
  89
  90     Example::
  91
  92         counted_array(Word(alphas)).parse_string('2 ab cd ef')  # -> ['ab', 'cd']
  93
  94         # in this parser, the leading integer value is given in binary,
  95         # '10' indicating that 2 values are in the array
  96         binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
  97         counted_array(Word(alphas), int_expr=binary_constant).parse_string('10 ab cd ef')  # -> ['ab', 'cd']
  98
  99         # if other fields must be parsed after the count but before the
 100         # list items, give the fields results names and they will
 101         # be preserved in the returned ParseResults:
 102         count_with_metadata = integer + Word(alphas)("type")
 103         typed_array = counted_array(Word(alphanums), int_expr=count_with_metadata)("items")
 104         result = typed_array.parse_string("3 bool True True False")
 105         print(result.dump())
 106
 107         # prints
 108         # ['True', 'True', 'False']
 109         # - items: ['True', 'True', 'False']
 110         # - type: 'bool'
 111     """
 112     intExpr = intExpr or int_expr
 113     array_expr = Forward()
 114
 115     def count_field_parse_action(s, l, t):
 116         nonlocal array_expr
 117         n = t[0]
 118         array_expr <<= (expr * n) if n else Empty()
 119         # clear list contents, but keep any named results
 120         del t[:]
 121
 122     if intExpr is None:
 123         intExpr = Word(nums).set_parse_action(lambda t: int(t[0]))
 124     else:
 125         intExpr = intExpr.copy()
 126     intExpr.set_name("arrayLen")
 127     intExpr.add_parse_action(count_field_parse_action, call_during_try=True)
 128     return (intExpr + array_expr).set_name("(len) " + str(expr) + "...")
 129
 130
 131 def match_previous_literal(expr: ParserElement) -> ParserElement:
 132     """Helper to define an expression that is indirectly defined from
 133     the tokens matched in a previous expression, that is, it looks for
 134     a 'repeat' of a previous expression.  For example::
 135
 136         first = Word(nums)
 137         second = match_previous_literal(first)
 138         match_expr = first + ":" + second
 139
 140     will match ``"1:1"``, but not ``"1:2"``.  Because this
 141     matches a previous literal, will also match the leading
 142     ``"1:1"`` in ``"1:10"``. If this is not desired, use
 143     :class:`match_previous_expr`. Do *not* use with packrat parsing
 144     enabled.
 145     """
 146     rep = Forward()
 147
 148     def copy_token_to_repeater(s, l, t):
 149         if t:
 150             if len(t) == 1:
 151                 rep << t[0]
 152             else:
 153                 # flatten t tokens
 154                 tflat = _flatten(t.as_list())
 155                 rep << And(Literal(tt) for tt in tflat)
 156         else:
 157             rep << Empty()
 158
 159     expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
 160     rep.set_name("(prev) " + str(expr))
 161     return rep
 162
 163
 164 def match_previous_expr(expr: ParserElement) -> ParserElement:
 165     """Helper to define an expression that is indirectly defined from
 166     the tokens matched in a previous expression, that is, it looks for
 167     a 'repeat' of a previous expression.  For example::
 168
 169         first = Word(nums)
 170         second = match_previous_expr(first)
 171         match_expr = first + ":" + second
 172
 173     will match ``"1:1"``, but not ``"1:2"``.  Because this
 174     matches by expressions, will *not* match the leading ``"1:1"``
 175     in ``"1:10"``; the expressions are evaluated first, and then
 176     compared, so ``"1"`` is compared with ``"10"``. Do *not* use
 177     with packrat parsing enabled.
 178     """
 179     rep = Forward()
 180     e2 = expr.copy()
 181     rep <<= e2
 182
 183     def copy_token_to_repeater(s, l, t):
 184         matchTokens = _flatten(t.as_list())
 185
 186         def must_match_these_tokens(s, l, t):
 187             theseTokens = _flatten(t.as_list())
 188             if theseTokens != matchTokens:
 189                 raise ParseException(
 190                     s, l, "Expected {}, found{}".format(matchTokens, theseTokens)
 191                 )
 192
 193         rep.set_parse_action(must_match_these_tokens, callDuringTry=True)
 194
 195     expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
 196     rep.set_name("(prev) " + str(expr))
 197     return rep
 198
 199
 200 def one_of(
 201     strs: Union[typing.Iterable[str], str],
 202     caseless: bool = False,
 203     use_regex: bool = True,
 204     as_keyword: bool = False,
 205     *,
 206     useRegex: bool = True,
 207     asKeyword: bool = False,
 208 ) -> ParserElement:
 209     """Helper to quickly define a set of alternative :class:`Literal` s,
 210     and makes sure to do longest-first testing when there is a conflict,
 211     regardless of the input order, but returns
 212     a :class:`MatchFirst` for best performance.
 213
 214     Parameters:
 215
 216     - ``strs`` - a string of space-delimited literals, or a collection of
 217       string literals
 218     - ``caseless`` - treat all literals as caseless - (default= ``False``)
 219     - ``use_regex`` - as an optimization, will
 220       generate a :class:`Regex` object; otherwise, will generate
 221       a :class:`MatchFirst` object (if ``caseless=True`` or ``asKeyword=True``, or if
 222       creating a :class:`Regex` raises an exception) - (default= ``True``)
 223     - ``as_keyword`` - enforce :class:`Keyword`-style matching on the
 224       generated expressions - (default= ``False``)
 225     - ``asKeyword`` and ``useRegex`` are retained for pre-PEP8 compatibility,
 226       but will be removed in a future release
 227
 228     Example::
 229
 230         comp_oper = one_of("< = > <= >= !=")
 231         var = Word(alphas)
 232         number = Word(nums)
 233         term = var | number
 234         comparison_expr = term + comp_oper + term
 235         print(comparison_expr.search_string("B = 12  AA=23 B<=AA AA>12"))
 236
 237     prints::
 238
 239         [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
 240     """
 241     asKeyword = asKeyword or as_keyword
 242     useRegex = useRegex and use_regex
 243
 244     if (
 245         isinstance(caseless, str_type)
 246         and __diag__.warn_on_multiple_string_args_to_oneof
 247     ):
 248         warnings.warn(
 249             "More than one string argument passed to one_of, pass"
 250             " choices as a list or space-delimited string",
 251             stacklevel=2,
 252         )
 253
 254     if caseless:
 255         isequal = lambda a, b: a.upper() == b.upper()
 256         masks = lambda a, b: b.upper().startswith(a.upper())
 257         parseElementClass = CaselessKeyword if asKeyword else CaselessLiteral
 258     else:
 259         isequal = lambda a, b: a == b
 260         masks = lambda a, b: b.startswith(a)
 261         parseElementClass = Keyword if asKeyword else Literal
 262
 263     symbols: List[str] = []
 264     if isinstance(strs, str_type):
 265         symbols = strs.split()
 266     elif isinstance(strs, Iterable):
 267         symbols = list(strs)
 268     else:
 269         raise TypeError("Invalid argument to one_of, expected string or iterable")
 270     if not symbols:
 271         return NoMatch()
 272
 273     # reorder given symbols to take care to avoid masking longer choices with shorter ones
 274     # (but only if the given symbols are not just single characters)
 275     if any(len(sym) > 1 for sym in symbols):
 276         i = 0
 277         while i < len(symbols) - 1:
 278             cur = symbols[i]
 279             for j, other in enumerate(symbols[i + 1 :]):
 280                 if isequal(other, cur):
 281                     del symbols[i + j + 1]
 282                     break
 283                 elif masks(cur, other):
 284                     del symbols[i + j + 1]
 285                     symbols.insert(i, other)
 286                     break
 287             else:
 288                 i += 1
 289
 290     if useRegex:
 291         re_flags: int = re.IGNORECASE if caseless else 0
 292
 293         try:
 294             if all(len(sym) == 1 for sym in symbols):
 295                 # symbols are just single characters, create range regex pattern
 296                 patt = "[{}]".format(
 297                     "".join(_escape_regex_range_chars(sym) for sym in symbols)
 298                 )
 299             else:
 300                 patt = "|".join(re.escape(sym) for sym in symbols)
 301
 302             # wrap with \b word break markers if defining as keywords
 303             if asKeyword:
 304                 patt = r"\b(?:{})\b".format(patt)
 305
 306             ret = Regex(patt, flags=re_flags).set_name(" | ".join(symbols))
 307
 308             if caseless:
 309                 # add parse action to return symbols as specified, not in random
 310                 # casing as found in input string
 311                 symbol_map = {sym.lower(): sym for sym in symbols}
 312                 ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])
 313
 314             return ret
 315
 316         except re.error:
 317             warnings.warn(
 318                 "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
 319             )
 320
 321     # last resort, just use MatchFirst
 322     return MatchFirst(parseElementClass(sym) for sym in symbols).set_name(
 323         " | ".join(symbols)
 324     )
 325
 326
 327 def dict_of(key: ParserElement, value: ParserElement) -> ParserElement:
 328     """Helper to easily and clearly define a dictionary by specifying
 329     the respective patterns for the key and value.  Takes care of
 330     defining the :class:`Dict`, :class:`ZeroOrMore`, and
 331     :class:`Group` tokens in the proper order.  The key pattern
 332     can include delimiting markers or punctuation, as long as they are
 333     suppressed, thereby leaving the significant key text.  The value
 334     pattern can include named results, so that the :class:`Dict` results
 335     can include named token fields.
 336
 337     Example::
 338
 339         text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
 340         attr_expr = (label + Suppress(':') + OneOrMore(data_word, stop_on=label).set_parse_action(' '.join))
 341         print(attr_expr[1, ...].parse_string(text).dump())
 342
 343         attr_label = label
 344         attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label).set_parse_action(' '.join)
 345
 346         # similar to Dict, but simpler call format
 347         result = dict_of(attr_label, attr_value).parse_string(text)
 348         print(result.dump())
 349         print(result['shape'])
 350         print(result.shape)  # object attribute access works too
 351         print(result.as_dict())
 352
 353     prints::
 354
 355         [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
 356         - color: 'light blue'
 357         - posn: 'upper left'
 358         - shape: 'SQUARE'
 359         - texture: 'burlap'
 360         SQUARE
 361         SQUARE
 362         {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'}
 363     """
 364     return Dict(OneOrMore(Group(key + value)))
 365
 366
 367 def original_text_for(
 368     expr: ParserElement, as_string: bool = True, *, asString: bool = True
 369 ) -> ParserElement:
 370     """Helper to return the original, untokenized text for a given
 371     expression.  Useful to restore the parsed fields of an HTML start
 372     tag into the raw tag text itself, or to revert separate tokens with
 373     intervening whitespace back to the original matching input text. By
 374     default, returns astring containing the original parsed text.
 375
 376     If the optional ``as_string`` argument is passed as
 377     ``False``, then the return value is
 378     a :class:`ParseResults` containing any results names that
 379     were originally matched, and a single token containing the original
 380     matched text from the input string.  So if the expression passed to
 381     :class:`original_text_for` contains expressions with defined
 382     results names, you must set ``as_string`` to ``False`` if you
 383     want to preserve those results name values.
 384
 385     The ``asString`` pre-PEP8 argument is retained for compatibility,
 386     but will be removed in a future release.
 387
 388     Example::
 389
 390         src = "this is test <b> bold <i>text</i> </b> normal text "
 391         for tag in ("b", "i"):
 392             opener, closer = make_html_tags(tag)
 393             patt = original_text_for(opener + SkipTo(closer) + closer)
 394             print(patt.search_string(src)[0])
 395
 396     prints::
 397
 398         ['<b> bold <i>text</i> </b>']
 399         ['<i>text</i>']
 400     """
 401     asString = asString and as_string
 402
 403     locMarker = Empty().set_parse_action(lambda s, loc, t: loc)
 404     endlocMarker = locMarker.copy()
 405     endlocMarker.callPreparse = False
 406     matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
 407     if asString:
 408         extractText = lambda s, l, t: s[t._original_start : t._original_end]
 409     else:
 410
 411         def extractText(s, l, t):
 412             t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]
 413
 414     matchExpr.set_parse_action(extractText)
 415     matchExpr.ignoreExprs = expr.ignoreExprs
 416     matchExpr.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
 417     return matchExpr
 418
 419
 420 def ungroup(expr: ParserElement) -> ParserElement:
 421     """Helper to undo pyparsing's default grouping of And expressions,
 422     even if all but one are non-empty.
 423     """
 424     return TokenConverter(expr).add_parse_action(lambda t: t[0])
 425
 426
 427 def locatedExpr(expr: ParserElement) -> ParserElement:
 428     """
 429     (DEPRECATED - future code should use the Located class)
 430     Helper to decorate a returned token with its starting and ending
 431     locations in the input string.
 432
 433     This helper adds the following results names:
 434
 435     - ``locn_start`` - location where matched expression begins
 436     - ``locn_end`` - location where matched expression ends
 437     - ``value`` - the actual parsed results
 438
 439     Be careful if the input text contains ``<TAB>`` characters, you
 440     may want to call :class:`ParserElement.parseWithTabs`
 441
 442     Example::
 443
 444         wd = Word(alphas)
 445         for match in locatedExpr(wd).searchString("ljsdf123lksdjjf123lkkjj1222"):
 446             print(match)
 447
 448     prints::
 449
 450         [[0, 'ljsdf', 5]]
 451         [[8, 'lksdjjf', 15]]
 452         [[18, 'lkkjj', 23]]
 453     """
 454     locator = Empty().set_parse_action(lambda ss, ll, tt: ll)
 455     return Group(
 456         locator("locn_start")
 457         + expr("value")
 458         + locator.copy().leaveWhitespace()("locn_end")
 459     )
 460
 461
 462 def nested_expr(
 463     opener: Union[str, ParserElement] = "(",
 464     closer: Union[str, ParserElement] = ")",
 465     content: typing.Optional[ParserElement] = None,
 466     ignore_expr: ParserElement = quoted_string(),
 467     *,
 468     ignoreExpr: ParserElement = quoted_string(),
 469 ) -> ParserElement:
 470     """Helper method for defining nested lists enclosed in opening and
 471     closing delimiters (``"("`` and ``")"`` are the default).
 472
 473     Parameters:
 474     - ``opener`` - opening character for a nested list
 475       (default= ``"("``); can also be a pyparsing expression
 476     - ``closer`` - closing character for a nested list
 477       (default= ``")"``); can also be a pyparsing expression
 478     - ``content`` - expression for items within the nested lists
 479       (default= ``None``)
 480     - ``ignore_expr`` - expression for ignoring opening and closing delimiters
 481       (default= :class:`quoted_string`)
 482     - ``ignoreExpr`` - this pre-PEP8 argument is retained for compatibility
 483       but will be removed in a future release
 484
 485     If an expression is not provided for the content argument, the
 486     nested expression will capture all whitespace-delimited content
 487     between delimiters as a list of separate values.
 488
 489     Use the ``ignore_expr`` argument to define expressions that may
 490     contain opening or closing characters that should not be treated as
 491     opening or closing characters for nesting, such as quoted_string or
 492     a comment expression.  Specify multiple expressions using an
 493     :class:`Or` or :class:`MatchFirst`. The default is
 494     :class:`quoted_string`, but if no expressions are to be ignored, then
 495     pass ``None`` for this argument.
 496
 497     Example::
 498
 499         data_type = one_of("void int short long char float double")
 500         decl_data_type = Combine(data_type + Opt(Word('*')))
 501         ident = Word(alphas+'_', alphanums+'_')
 502         number = pyparsing_common.number
 503         arg = Group(decl_data_type + ident)
 504         LPAR, RPAR = map(Suppress, "()")
 505
 506         code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))
 507
 508         c_function = (decl_data_type("type")
 509                       + ident("name")
 510                       + LPAR + Opt(delimited_list(arg), [])("args") + RPAR
 511                       + code_body("body"))
 512         c_function.ignore(c_style_comment)
 513
 514         source_code = '''
 515             int is_odd(int x) {
 516                 return (x%2);
 517             }
 518
 519             int dec_to_hex(char hchar) {
 520                 if (hchar >= '0' && hchar <= '9') {
 521                     return (ord(hchar)-ord('0'));
 522                 } else {
 523                     return (10+ord(hchar)-ord('A'));
 524                 }
 525             }
 526         '''
 527         for func in c_function.search_string(source_code):
 528             print("%(name)s (%(type)s) args: %(args)s" % func)
 529
 530
 531     prints::
 532
 533         is_odd (int) args: [['int', 'x']]
 534         dec_to_hex (int) args: [['char', 'hchar']]
 535     """
 536     if ignoreExpr != ignore_expr:
 537         ignoreExpr = ignore_expr if ignoreExpr == quoted_string() else ignoreExpr
 538     if opener == closer:
 539         raise ValueError("opening and closing strings cannot be the same")
 540     if content is None:
 541         if isinstance(opener, str_type) and isinstance(closer, str_type):
 542             if len(opener) == 1 and len(closer) == 1:
 543                 if ignoreExpr is not None:
 544                     content = Combine(
 545                         OneOrMore(
 546                             ~ignoreExpr
 547                             + CharsNotIn(
 548                                 opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
 549                                 exact=1,
 550                             )
 551                         )
 552                     ).set_parse_action(lambda t: t[0].strip())
 553                 else:
 554                     content = empty.copy() + CharsNotIn(
 555                         opener + closer + ParserElement.DEFAULT_WHITE_CHARS
 556                     ).set_parse_action(lambda t: t[0].strip())
 557             else:
 558                 if ignoreExpr is not None:
 559                     content = Combine(
 560                         OneOrMore(
 561                             ~ignoreExpr
 562                             + ~Literal(opener)
 563                             + ~Literal(closer)
 564                             + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
 565                         )
 566                     ).set_parse_action(lambda t: t[0].strip())
 567                 else:
 568                     content = Combine(
 569                         OneOrMore(
 570                             ~Literal(opener)
 571                             + ~Literal(closer)
 572                             + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
 573                         )
 574                     ).set_parse_action(lambda t: t[0].strip())
 575         else:
 576             raise ValueError(
 577                 "opening and closing arguments must be strings if no content expression is given"
 578             )
 579     ret = Forward()
 580     if ignoreExpr is not None:
 581         ret <<= Group(
 582             Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
 583         )
 584     else:
 585         ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))
 586     ret.set_name("nested %s%s expression" % (opener, closer))
 587     return ret
 588
 589
 590 def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
 591     """Internal helper to construct opening and closing tag expressions, given a tag name"""
 592     if isinstance(tagStr, str_type):
 593         resname = tagStr
 594         tagStr = Keyword(tagStr, caseless=not xml)
 595     else:
 596         resname = tagStr.name
 597
 598     tagAttrName = Word(alphas, alphanums + "_-:")
 599     if xml:
 600         tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
 601         openTag = (
 602             suppress_LT
 603             + tagStr("tag")
 604             + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
 605             + Opt("/", default=[False])("empty").set_parse_action(
 606                 lambda s, l, t: t[0] == "/"
 607             )
 608             + suppress_GT
 609         )
 610     else:
 611         tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
 612             printables, exclude_chars=">"
 613         )
 614         openTag = (
 615             suppress_LT
 616             + tagStr("tag")
 617             + Dict(
 618                 ZeroOrMore(
 619                     Group(
 620                         tagAttrName.set_parse_action(lambda t: t[0].lower())
 621                         + Opt(Suppress("=") + tagAttrValue)
 622                     )
 623                 )
 624             )
 625             + Opt("/", default=[False])("empty").set_parse_action(
 626                 lambda s, l, t: t[0] == "/"
 627             )
 628             + suppress_GT
 629         )
 630     closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)
 631
 632     openTag.set_name("<%s>" % resname)
 633     # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
 634     openTag.add_parse_action(
 635         lambda t: t.__setitem__(
 636             "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
 637         )
 638     )
 639     closeTag = closeTag(
 640         "end" + "".join(resname.replace(":", " ").title().split())
 641     ).set_name("</%s>" % resname)
 642     openTag.tag = resname
 643     closeTag.tag = resname
 644     openTag.tag_body = SkipTo(closeTag())
 645     return openTag, closeTag
 646
 647
 648 def make_html_tags(
 649     tag_str: Union[str, ParserElement]
 650 ) -> Tuple[ParserElement, ParserElement]:
 651     """Helper to construct opening and closing tag expressions for HTML,
 652     given a tag name. Matches tags in either upper or lower case,
 653     attributes with namespaces and with quoted or unquoted values.
 654
 655     Example::
 656
 657         text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
 658         # make_html_tags returns pyparsing expressions for the opening and
 659         # closing tags as a 2-tuple
 660         a, a_end = make_html_tags("A")
 661         link_expr = a + SkipTo(a_end)("link_text") + a_end
 662
 663         for link in link_expr.search_string(text):
 664             # attributes in the <A> tag (like "href" shown here) are
 665             # also accessible as named results
 666             print(link.link_text, '->', link.href)
 667
 668     prints::
 669
 670         pyparsing -> https://github.com/pyparsing/pyparsing/wiki
 671     """
 672     return _makeTags(tag_str, False)
 673
 674
 675 def make_xml_tags(
 676     tag_str: Union[str, ParserElement]
 677 ) -> Tuple[ParserElement, ParserElement]:
 678     """Helper to construct opening and closing tag expressions for XML,
 679     given a tag name. Matches tags only in the given upper/lower case.
 680
 681     Example: similar to :class:`make_html_tags`
 682     """
 683     return _makeTags(tag_str, True)
 684
 685
# Generic HTML tag expressions: the tag name is any word that starts with a
# letter and continues with letters, digits, "_" or ":".
any_open_tag: ParserElement
any_close_tag: ParserElement
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)

# Entity name -> replacement text, taken from the stdlib HTML5 entity table
# with any trailing ";" stripped from the names.
_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
# Matches "&<name>;" for any name in the map above; the name is exposed as
# the named group "entity".
common_html_entity = Regex("&(?P<entity>" + "|".join(_htmlEntityMap) + ");").set_name(
    "common HTML entity"
)
 696
 697
 698 def replace_html_entity(t):
 699     """Helper parser action to replace common HTML entities with their special characters"""
 700     return _htmlEntityMap.get(t.entity)
 701
 702
 703 class OpAssoc(Enum):
 704     LEFT = 1
 705     RIGHT = 2
 706
 707
# Type of the operator member of an operator definition: a single
# expression or string, or a 2-tuple of them.
InfixNotationOperatorArgType = Union[
    ParserElement, str, Tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# Type of one operator definition: (operator, int, associativity), with an
# optional parse action as a fourth member.
# NOTE(review): ParseAction is not defined in this file; presumably it comes
# from the star import of .core - confirm.
InfixNotationOperatorSpec = Union[
    Tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    Tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
 724
 725
 726 def infix_notation(
 727     base_expr: ParserElement,
 728     op_list: List[InfixNotationOperatorSpec],
 729     lpar: Union[str, ParserElement] = Suppress("("),
 730     rpar: Union[str, ParserElement] = Suppress(")"),
 731 ) -> ParserElement:
 732     """Helper method for constructing grammars of expressions made up of
 733     operators working in a precedence hierarchy.  Operators may be unary
 734     or binary, left- or right-associative.  Parse actions can also be
 735     attached to operator expressions. The generated parser will also
 736     recognize the use of parentheses to override operator precedences
 737     (see example below).
 738
 739     Note: if you define a deep operator list, you may see performance
 740     issues when using infix_notation. See
 741     :class:`ParserElement.enable_packrat` for a mechanism to potentially
 742     improve your parser performance.
 743
 744     Parameters:
 745     - ``base_expr`` - expression representing the most basic operand to
 746       be used in the expression
 747     - ``op_list`` - list of tuples, one for each operator precedence level
 748       in the expression grammar; each tuple is of the form ``(op_expr,
 749       num_operands, right_left_assoc, (optional)parse_action)``, where:
 750
 751       - ``op_expr`` is the pyparsing expression for the operator; may also
 752         be a string, which will be converted to a Literal; if ``num_operands``
 753         is 3, ``op_expr`` is a tuple of two expressions, for the two
 754         operators separating the 3 terms
 755       - ``num_operands`` is the number of terms for this operator (must be 1,
 756         2, or 3)
 757       - ``right_left_assoc`` is the indicator whether the operator is right
 758         or left associative, using the pyparsing-defined constants
 759         ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
 760       - ``parse_action`` is the parse action to be associated with
 761         expressions matching this operator expression (the parse action
 762         tuple member may be omitted); if the parse action is passed
 763         a tuple or list of functions, this is equivalent to calling
 764         ``set_parse_action(*fn)``
 765         (:class:`ParserElement.set_parse_action`)
 766     - ``lpar`` - expression for matching left-parentheses; if passed as a
 767       str, then will be parsed as Suppress(lpar). If lpar is passed as
 768       an expression (such as ``Literal('(')``), then it will be kept in
 769       the parsed results, and grouped with them. (default= ``Suppress('(')``)
 770     - ``rpar`` - expression for matching right-parentheses; if passed as a
 771       str, then will be parsed as Suppress(rpar). If rpar is passed as
 772       an expression (such as ``Literal(')')``), then it will be kept in
 773       the parsed results, and grouped with them. (default= ``Suppress(')')``)
 774
 775     Example::
 776
 777         # simple example of four-function arithmetic with ints and
 778         # variable names
 779         integer = pyparsing_common.signed_integer
 780         varname = pyparsing_common.identifier
 781
 782         arith_expr = infix_notation(integer | varname,
 783             [
 784             ('-', 1, OpAssoc.RIGHT),
 785             (one_of('* /'), 2, OpAssoc.LEFT),
 786             (one_of('+ -'), 2, OpAssoc.LEFT),
 787             ])
 788
 789         arith_expr.run_tests('''
 790             5+3*6
 791             (5+3)*6
 792             -2--11
 793             ''', full_dump=False)
 794
 795     prints::
 796
 797         5+3*6
 798         [[5, '+', [3, '*', 6]]]
 799
 800         (5+3)*6
 801         [[[5, '+', 3], '*', 6]]
 802
 803         -2--11
 804         [[['-', 2], '-', ['-', 11]]]
 805     """
 806     # captive version of FollowedBy that does not do parse actions or capture results names
 807     class _FB(FollowedBy):
 808         def parseImpl(self, instring, loc, doActions=True):
 809             self.expr.try_parse(instring, loc)
 810             return loc, []
 811
 812     _FB.__name__ = "FollowedBy>"
 813
 814     ret = Forward()
 815     if isinstance(lpar, str):
 816         lpar = Suppress(lpar)
 817     if isinstance(rpar, str):
 818         rpar = Suppress(rpar)
 819
 820     # if lpar and rpar are not suppressed, wrap in group
 821     if not (isinstance(rpar, Suppress) and isinstance(rpar, Suppress)):
 822         lastExpr = base_expr | Group(lpar + ret + rpar)
 823     else:
 824         lastExpr = base_expr | (lpar + ret + rpar)
 825
 826     for i, operDef in enumerate(op_list):
 827         opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4]
 828         if isinstance(opExpr, str_type):
 829             opExpr = ParserElement._literalStringClass(opExpr)
 830         if arity == 3:
 831             if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
 832                 raise ValueError(
 833                     "if numterms=3, opExpr must be a tuple or list of two expressions"
 834                 )
 835             opExpr1, opExpr2 = opExpr
 836             term_name = "{}{} term".format(opExpr1, opExpr2)
 837         else:
 838             term_name = "{} term".format(opExpr)
 839
 840         if not 1 <= arity <= 3:
 841             raise ValueError("operator must be unary (1), binary (2), or ternary (3)")
 842
 843         if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
 844             raise ValueError("operator must indicate right or left associativity")
 845
 846         thisExpr: Forward = Forward().set_name(term_name)
 847         if rightLeftAssoc is OpAssoc.LEFT:
 848             if arity == 1:
 849                 matchExpr = _FB(lastExpr + opExpr) + Group(lastExpr + opExpr[1, ...])
 850             elif arity == 2:
 851                 if opExpr is not None:
 852                     matchExpr = _FB(lastExpr + opExpr + lastExpr) + Group(
 853                         lastExpr + (opExpr + lastExpr)[1, ...]
 854                     )
 855                 else:
 856                     matchExpr = _FB(lastExpr + lastExpr) + Group(lastExpr[2, ...])
 857             elif arity == 3:
 858                 matchExpr = _FB(
 859                     lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
 860                 ) + Group(lastExpr + OneOrMore(opExpr1 + lastExpr + opExpr2 + lastExpr))
 861         elif rightLeftAssoc is OpAssoc.RIGHT:
 862             if arity == 1:
 863                 # try to avoid LR with this extra test
 864                 if not isinstance(opExpr, Opt):
 865                     opExpr = Opt(opExpr)
 866                 matchExpr = _FB(opExpr.expr + thisExpr) + Group(opExpr + thisExpr)
 867             elif arity == 2:
 868                 if opExpr is not None:
 869                     matchExpr = _FB(lastExpr + opExpr + thisExpr) + Group(
 870                         lastExpr + (opExpr + thisExpr)[1, ...]
 871                     )
 872                 else:
 873                     matchExpr = _FB(lastExpr + thisExpr) + Group(
 874                         lastExpr + thisExpr[1, ...]
 875                     )
 876             elif arity == 3:
 877                 matchExpr = _FB(
 878                     lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
 879                 ) + Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)
 880         if pa:
 881             if isinstance(pa, (tuple, list)):
 882                 matchExpr.set_parse_action(*pa)
 883             else:
 884                 matchExpr.set_parse_action(pa)
 885         thisExpr <<= (matchExpr | lastExpr).setName(term_name)
 886         lastExpr = thisExpr
 887     ret <<= lastExpr
 888     return ret
 889
 890
 891 def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
 892     """
 893     (DEPRECATED - use IndentedBlock class instead)
 894     Helper method for defining space-delimited indentation blocks,
 895     such as those used to define block statements in Python source code.
 896
 897     Parameters:
 898
 899     - ``blockStatementExpr`` - expression defining syntax of statement that
 900       is repeated within the indented block
 901     - ``indentStack`` - list created by caller to manage indentation stack
 902       (multiple ``statementWithIndentedBlock`` expressions within a single
 903       grammar should share a common ``indentStack``)
 904     - ``indent`` - boolean indicating whether block must be indented beyond
 905       the current level; set to ``False`` for block of left-most statements
 906       (default= ``True``)
 907
 908     A valid block must contain at least one ``blockStatement``.
 909
 910     (Note that indentedBlock uses internal parse actions which make it
 911     incompatible with packrat parsing.)
 912
 913     Example::
 914
 915         data = '''
 916         def A(z):
 917           A1
 918           B = 100
 919           G = A2
 920           A2
 921           A3
 922         B
 923         def BB(a,b,c):
 924           BB1
 925           def BBA():
 926             bba1
 927             bba2
 928             bba3
 929         C
 930         D
 931         def spam(x,y):
 932              def eggs(z):
 933                  pass
 934         '''
 935
 936
 937         indentStack = [1]
 938         stmt = Forward()
 939
 940         identifier = Word(alphas, alphanums)
 941         funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
 942         func_body = indentedBlock(stmt, indentStack)
 943         funcDef = Group(funcDecl + func_body)
 944
 945         rvalue = Forward()
 946         funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
 947         rvalue << (funcCall | identifier | Word(nums))
 948         assignment = Group(identifier + "=" + rvalue)
 949         stmt << (funcDef | assignment | identifier)
 950
 951         module_body = stmt[1, ...]
 952
 953         parseTree = module_body.parseString(data)
 954         parseTree.pprint()
 955
 956     prints::
 957
 958         [['def',
 959           'A',
 960           ['(', 'z', ')'],
 961           ':',
 962           [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
 963          'B',
 964          ['def',
 965           'BB',
 966           ['(', 'a', 'b', 'c', ')'],
 967           ':',
 968           [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
 969          'C',
 970          'D',
 971          ['def',
 972           'spam',
 973           ['(', 'x', 'y', ')'],
 974           ':',
 975           [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
 976     """
 977     backup_stacks.append(indentStack[:])
 978
 979     def reset_stack():
 980         indentStack[:] = backup_stacks[-1]
 981
 982     def checkPeerIndent(s, l, t):
 983         if l >= len(s):
 984             return
 985         curCol = col(l, s)
 986         if curCol != indentStack[-1]:
 987             if curCol > indentStack[-1]:
 988                 raise ParseException(s, l, "illegal nesting")
 989             raise ParseException(s, l, "not a peer entry")
 990
 991     def checkSubIndent(s, l, t):
 992         curCol = col(l, s)
 993         if curCol > indentStack[-1]:
 994             indentStack.append(curCol)
 995         else:
 996             raise ParseException(s, l, "not a subentry")
 997
 998     def checkUnindent(s, l, t):
 999         if l >= len(s):
1000             return
1001         curCol = col(l, s)
1002         if not (indentStack and curCol in indentStack):
1003             raise ParseException(s, l, "not an unindent")
1004         if curCol < indentStack[-1]:
1005             indentStack.pop()
1006
1007     NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
1008     INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
1009     PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
1010     UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
1011     if indent:
1012         smExpr = Group(
1013             Opt(NL)
1014             + INDENT
1015             + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
1016             + UNDENT
1017         )
1018     else:
1019         smExpr = Group(
1020             Opt(NL)
1021             + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
1022             + Opt(UNDENT)
1023         )
1024
1025     # add a parse action to remove backup_stack from list of backups
1026     smExpr.add_parse_action(
1027         lambda: backup_stacks.pop(-1) and None if backup_stacks else None
1028     )
1029     smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
1030     blockStatementExpr.ignore(_bslash + LineEnd())
1031     return smExpr.set_name("indented block")
1032
1033
# it's easy to get these comment structures wrong - they're very common, so may as well make them available

# The regex matches "/*" followed by any run of characters that are either
# not "*" or are a "*" not followed by "/"; the closing "*/" is then matched
# as a separate literal, and both pieces are wrapped in Combine.
c_style_comment = Combine(Regex(r"/\*(?:[^*]|\*(?!/))*") + "*/").set_name(
    "C style comment"
)
"Comment of the form ``/* ... */``"

# ``[\s\S]*?`` is a non-greedy match of any character, newlines included, so
# the comment ends at the first "-->".
html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# Everything up to (not including) the next newline; leave_whitespace() is
# applied so that leading whitespace is not skipped before matching.
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")
# "//" followed by any characters up to the end of the line; a backslash
# immediately followed by a newline is consumed as part of the comment.
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

# ``+`` binds tighter than ``|``, so this is
# (block-comment regex + "*/") | dbl_slash_comment, all inside one Combine.
# The block-comment regex is the same pattern used by c_style_comment.
cpp_style_comment = Combine(
    Regex(r"/\*(?:[^*]|\*(?!/))*") + "*/" | dbl_slash_comment
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

# Plain alias: the very same object as cpp_style_comment, not a copy.
java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

# "#" followed by everything up to the end of the line.
python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"
1057
1058
# Collect every ParserElement that is bound to a module-level name at this
# point in the file, so the built-in expressions can be found again later if
# a global default value gets updated.
_builtin_exprs: List[ParserElement] = list(
    filter(lambda candidate: isinstance(candidate, ParserElement), vars().values())
)
1064
1065
# pre-PEP8 compatible names
# Each camelCase name below is bound to the same object as its snake_case
# counterpart (no wrapper, no copy). These bindings come after
# ``_builtin_exprs`` has been computed, so they do not add further entries
# to that list.
delimitedList = delimited_list
countedArray = counted_array
matchPreviousLiteral = match_previous_literal
matchPreviousExpr = match_previous_expr
oneOf = one_of
dictOf = dict_of
originalTextFor = original_text_for
nestedExpr = nested_expr
makeHTMLTags = make_html_tags
makeXMLTags = make_xml_tags
anyOpenTag, anyCloseTag = any_open_tag, any_close_tag
commonHTMLEntity = common_html_entity
replaceHTMLEntity = replace_html_entity
opAssoc = OpAssoc
infixNotation = infix_notation
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment