summaryrefslogtreecommitdiffstats
path: root/tools/lib/python/kdoc/c_lex.py
diff options
context:
space:
mode:
Diffstat (limited to 'tools/lib/python/kdoc/c_lex.py')
-rw-r--r--tools/lib/python/kdoc/c_lex.py662
1 files changed, 662 insertions, 0 deletions
diff --git a/tools/lib/python/kdoc/c_lex.py b/tools/lib/python/kdoc/c_lex.py
new file mode 100644
index 000000000000..cb95f5172448
--- /dev/null
+++ b/tools/lib/python/kdoc/c_lex.py
@@ -0,0 +1,662 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: GPL-2.0
+# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
+
+"""
+C tokenizer and nested-delimiter matching classes for kernel-doc.
+
+Those help splitting C code into tokens and doing search and replace on
+expressions enclosed by paired delimiters.
+
+Please notice that the code here may raise exceptions to indicate bad
+usage inside kdoc, like problems at the replace pattern.
+
+Other errors are logged via log instance.
+"""
+
+import logging
+import re
+
+from copy import copy
+
+from .kdoc_re import KernRe
+
+log = logging.getLogger(__name__)
+
def tokenizer_set_log(logger, prefix = ""):
    """
    Install a prefixing logger as the module-level ``log``.

    ``logger`` is wrapped by a ``logging.LoggerAdapter`` subclass whose
    ``process()`` places *prefix* in front of every message. The adapter
    replaces the module-level ``log`` object.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Logger adapter that prepends a fixed prefix to each message.
        """
        def process(self, msg, kwargs):
            prefixed = f"{prefix}{msg}"
            return prefixed, kwargs

    # The prefix is also kept as the adapter's "extra" data
    extra = {"prefix": prefix}
    log = PrefixAdapter(logger, extra)
+
class CToken():
    """
    Data class to define a C token.

    Each token stores its ``kind`` (one of the enum-like integer constants
    below), the matched ``value``, the ``pos`` given by the creator and a
    ``level`` tuple ordered as ``(bracket_level, paren_level, brace_level)``.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    #
    # vars() is the class namespace under construction. Besides the
    # constants above, the interpreter may store its own integer entries
    # there (Python 3.13+ adds ``__firstlineno__``), so names starting with
    # an underscore are skipped to keep only the token kinds.
    _name_by_val = {
        v: k for k, v in dict(vars()).items()
        if isinstance(v, int) and not k.startswith("_")
    }

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """
        Convert an integer value from the CToken enum into its name.

        Returns ``UNKNOWN(<val>)`` if ``val`` is not a known token kind.
        """

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """
        Convert a token kind name into its CToken enum value.

        Returns ``CToken.MISMATCH`` if ``name`` is not a known token kind.
        """

        return CToken._name_to_val.get(name, CToken.MISMATCH)

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos

        # Notice that the tuple order differs from the argument order
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)

        # Only string values are shown between quotation marks
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
+
#: Regexes to parse C code, transforming it into tokens.
#:
#: Order matters: the patterns are joined as alternatives of a single
#: regex, so, at a given position, the first one that matches wins.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    #
    # Hexadecimal, octal and decimal/float numbers. Decimal numbers also
    # accept the unsigned suffix, so that values like ``1U`` and ``10UL``
    # become a single NUMBER instead of a NUMBER followed by a NAME.
    #
    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[uUfFlL]*"),

    # Spaces placed just before ";" are part of the ENDSTMT token
    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    # Known preprocessor directives must come before the generic hash
    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    # Multi-character operators come first, as the first alternative wins
    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    # Keywords must come before the generic NAME pattern
    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    # Catch-all for any character not handled by the patterns above
    (CToken.MISMATCH,r"."),
]
+
def fill_re_scanner(token_list):
    """
    Build the scanner regex from a list of ``(kind, pattern)`` tuples.

    Each pattern becomes a named group, whose name is the CToken name of
    its kind. The groups are joined as alternatives, in list order, into a
    single KernRe compiled with ``re.MULTILINE | re.DOTALL``, suitable to be
    used with finditer.
    """
    alternatives = "|".join(
        f"(?P<{CToken.to_name(kind)}>{pattern})"
        for kind, pattern in token_list
    )

    return KernRe(alternatives, re.MULTILINE | re.DOTALL)
+
#: Handle C continuation lines (a backslash followed by a new line).
RE_CONT = KernRe(r"\\\n")

#: Match the start of a C comment (``/*``) and the spaces after it.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built from RE_SCANNER_LIST when the module is loaded.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
+
+
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None):
        """
        Create a tokenizer for ``source``.

        ``source`` can be:

        - ``None`` or anything empty: the token list starts empty;
        - a list: it is used as the token list as-is (it is not copied);
        - a string: it is split into CToken objects via RE_SCANNER.
        """

        #
        # Store logger to allow parser classes to re-use it
        #
        self.log = log

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator yields CToken objects. Each one carries the nesting
        levels in effect after the token is processed: a BEGIN token already
        has its own level incremented, and an END token has it decremented.

        Characters that match no pattern are reported via ``log.error`` and
        yielded as MISMATCH tokens.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")

            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:   # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                #
                # Each delimiter only touches its own counter: an unpaired
                # ")" or "]" must not close a brace level. Levels never
                # go below zero.
                #
                if value == ')':
                    if paren_level > 0:
                        paren_level -= 1
                elif value == ']':
                    if bracket_level > 0:
                        bracket_level -= 1
                elif brace_level > 0:   # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Convert the tokens back into a string.

        Comments are dropped. A comment starting with ``private:`` hides
        the tokens that follow it at the current nesting level (and at the
        levels opened inside it), until a ``public:`` comment or the end of
        that level. Spaces before ``;`` and repeated ``;`` are dropped.
        """
        out = []

        #
        # Visibility per nesting level. The last element is the current
        # level; the first one is never removed.
        #
        show_stack = [True]

        last = len(self.tokens) - 1

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                # A new level inherits the visibility of its parent
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # A hidden level ends inside a visible one. The spaces
                    # before the END were hidden, so try to preserve indent
                    #
                    out.append("\t" * (len(show_stack) - 1))

                    out.append(str(tok.value))
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                # Comments themselves are never part of the output
                continue

            if not show_stack[-1]:
                continue

            if i < last:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out.append(str(tok.value))

        return "".join(out)
+
+
class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a "+" at the last element,
    the logic will be greedy, picking all other delims.

    This is needed to parse struct_group macros which end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        r"""
        Parse the backrefs (``\0``, ``\1``, ..., optionally followed by
        ``+``) found at the ``sub_str`` replacement pattern.

        Raises ValueError if more than one backref is greedy or if the
        greedy backref is not the highest one.
        """
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                # Group 0 is a valid value, so compare against None
                if self.greedy is not None and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy is not None:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            # The "+" mark is not part of the replacement text
            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character to the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all other
        arguments afterwards.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        Arguments are split only at delimiters placed at the outermost
        level: ``,`` inside parentheses and ``;`` inside braces.

        Returns a ``(level, groups_list)`` tuple, where ``level`` is the
        level of the first begin delimiter and ``groups_list[n]`` has the
        tokens for ``\n``.

        Please notice that, on C, square brackets don't have any separator
        on it. Trying to use ``\1``..``\n`` for brackets raises ValueError.
        If there are less arguments than the highest backref, an error is
        logged.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [ [] ]

        if 0 in self.sub_groups:
            inner_level = 0

            for tok in tokens:
                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard only the outermost begin. Nested ones belong
                    # to the contents, even when they come first.
                    #
                    if inner_level == 1:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of begin gives the
        # delimiter to be used for the matches
        #
        first_arg = len(tokens)

        for i, tok in enumerate(tokens):
            if tok.kind != CToken.BEGIN:
                continue

            if tok.value == "{":
                delim = ";"
            elif tok.value == "(":
                delim = ","
            else:
                raise ValueError(fr"Can't handle \1..\n on {self.sub_str}")

            level = tok.level
            first_arg = i + 1
            break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for tok in tokens[first_arg:]:
            if tok.kind == CToken.BEGIN:
                inner_level += 1
            elif tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            #
            # Only delimiters at the outermost level split arguments.
            # ENDSTMT values may carry the spaces placed before ";".
            #
            if (inner_level == 0 and
                    tok.kind in (CToken.PUNC, CToken.ENDSTMT) and
                    tok.value.strip() == delim):
                pos += 1
                if self.greedy is not None and pos > self.max_group:
                    #
                    # Greedy: the delimiter and everything after it
                    # belong to the last group
                    #
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """
        Return the token list of the replacement pattern, with each backref
        replaced by a copy of the tokens of the corresponding group taken
        from ``new_tokenizer``.

        The level of the first begin delimiter is added to the level of
        each copied token. Tokens of the pattern itself are used as-is.
        """
        level, groups = self.groups(new_tokenizer)

        new_tokens = []

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind != CToken.BACKREF:
                new_tokens.append(tok)
                continue

            # BACKREF values are a backslash followed by the group number
            group = int(tok.value[1:])

            for group_tok in groups[group]:
                # Copy, as the original token still belongs to the source
                new_tok = copy(group_tok)
                new_tok.level = tuple(cur + base
                                      for cur, base in zip(new_tok.level, level))

                new_tokens.append(new_tok)

        return new_tokens
+
+
class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    def __init__(self, regex, delim="("):
        """
        ``regex`` is matched against the start of NAME tokens, followed by
        a word boundary. ``delim`` is the begin delimiter expected after
        the name.
        """
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for a NAME token matching the regex, followed
        by the start delimiter, and yields ``(start, end)`` token indexes
        (``end`` is inclusive) when an END token brings the levels back to
        the ones of the name.

        If the name is not followed by the start delimiter, only the name
        and the spaces after it are yielded.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        tokens = tokenizer.tokens

        start = None        # Index of the matched name, if any
        level = None        # Levels of the matched name
        started = False     # True after the start delimiter is found

        for i, tok in enumerate(tokens):
            if start is not None and not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                #
                # Name only token without BEGIN/END: the match stops at
                # the previous token. The current one is still checked
                # below, as it may start a new match.
                #
                yield start, i - 1
                start = None

            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    level = tok.level
                    started = False

                continue

            # The paired END has the same levels as the name
            if tok.kind == CToken.END and tok.level == level:
                yield start, i
                start = None

        #
        # If an END zeroing levels is not there, return remaining stuff
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block. Notice that start can be zero.
        #
        if start is not None:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        This is a generator. It yields CTokenizer objects if ``source``
        is a CTokenizer. Otherwise, it yields strings.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.

        Returns a CTokenizer if ``source`` is a CTokenizer. Otherwise,
        returns a string.
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlays at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing by "", this should not
        # be a problem, but other transformations could be problematic
        #
        for start, end in self._search(tokenizer):
            # Tokens between the previous match and this one are kept
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'