diff options
| author | Jonathan Corbet <corbet@lwn.net> | 2026-03-22 15:06:59 -0600 |
|---|---|---|
| committer | Jonathan Corbet <corbet@lwn.net> | 2026-03-22 15:06:59 -0600 |
| commit | 781171bec0650c00c642564afcb5cce57abda5bf (patch) | |
| tree | 31e8cf6d41b40bef9f432ebbe4a8180f32af9fbf /tools/lib/python/kdoc/xforms_lists.py | |
| parent | 6108c809f4fd9dbb1a138ba4326d645cc3113a8d (diff) | |
| parent | 7538df7a2d7d26428803cf8053476169a6d28659 (diff) | |
| download | linux-781171bec0650c00c642564afcb5cce57abda5bf.tar.gz linux-781171bec0650c00c642564afcb5cce57abda5bf.zip | |
Merge branch 'mauro' into docs-mw
Mauro says:
This patch series change how kdoc parser handles macro replacements.
Instead of heavily relying on regular expressions that can sometimes
be very complex, it uses a C lexical tokenizer. This ensures that
BEGIN/END blocks on functions and structs are properly handled,
even when nested.
Checking before/after the patch series, for both man pages and
rst only had:
- whitespace differences;
- struct_group macros now are shown as inner anonimous structs
as it should be.
Also, I didn't notice any relevant change on the documentation build
time. With that regards, right now, every time a CMatch replacement
rule takes in place, it does:
for each transform:
- tokenizes the source code;
- handle CMatch;
- convert tokens back to a string.
A possible optimization would be to do, instead:
- tokenizes source code;
- for each transform handle CMatch;
- convert tokens back to a string.
For now, I opted not do do it, because:
- too much changes on a single row;
- docs build time is taking ~3:30 minutes, which is
about the same time it ws taken before the changes;
- there is a very dirty hack inside function_xforms:
(KernRe(r"_noprof"), ""). This is meant to change
function prototypes instead of function arguments.
So, if ok for you, I would prefer to merge this one first. We can later
optimize kdoc_parser to avoid multiple token <-> string conversions.
-
One important aspect of this series is that it introduces unittests
for kernel-doc. I used it a lot during the development of this series,
to ensure that the changes I was doing were producing the expected
results. Tests are on two separate files that can be executed directly.
Alternatively, there is a run.py script that runs all of them (and
any other python script named tools/unittests/test_*.py"):
$ tools/unittests/run.py
test_cmatch:
TestSearch:
test_search_acquires_multiple: OK
test_search_acquires_nested_paren: OK
test_search_acquires_simple: OK
test_search_must_hold: OK
test_search_must_hold_shared: OK
test_search_no_false_positive: OK
test_search_no_function: OK
test_search_no_macro_remains: OK
TestSubMultipleMacros:
test_acquires_multiple: OK
test_acquires_nested_paren: OK
test_acquires_simple: OK
test_mixed_macros: OK
test_must_hold: OK
test_must_hold_shared: OK
test_no_false_positive: OK
test_no_function: OK
test_no_macro_remains: OK
TestSubSimple:
test_rise_early_greedy: OK
test_rise_multiple_greedy: OK
test_strip_multiple_acquires: OK
test_sub_count_parameter: OK
test_sub_mixed_placeholders: OK
test_sub_multiple_placeholders: OK
test_sub_no_placeholder: OK
test_sub_single_placeholder: OK
test_sub_with_capture: OK
test_sub_zero_placeholder: OK
TestSubWithLocalXforms:
test_functions_with_acquires_and_releases: OK
test_raw_struct_group: OK
test_raw_struct_group_tagged: OK
test_struct_group: OK
test_struct_group_attr: OK
test_struct_group_tagged_with_private: OK
test_struct_kcov: OK
test_vars_stackdepot: OK
test_tokenizer:
TestPublicPrivate:
test_balanced_inner_private: OK
test_balanced_non_greddy_private: OK
test_balanced_private: OK
test_no private: OK
test_unbalanced_inner_private: OK
test_unbalanced_private: OK
test_unbalanced_struct_group_tagged_with_private: OK
test_unbalanced_two_struct_group_tagged_first_with_private: OK
test_unbalanced_without_end_of_line: OK
TestTokenizer:
test_basic_tokens: OK
test_depth_counters: OK
test_mismatch_error: OK
Ran 47 tests
Diffstat (limited to 'tools/lib/python/kdoc/xforms_lists.py')
| -rw-r--r-- | tools/lib/python/kdoc/xforms_lists.py | 235 |
1 files changed, 109 insertions, 126 deletions
diff --git a/tools/lib/python/kdoc/xforms_lists.py b/tools/lib/python/kdoc/xforms_lists.py index c07cbe1e6349..f6ea9efb11ae 100644 --- a/tools/lib/python/kdoc/xforms_lists.py +++ b/tools/lib/python/kdoc/xforms_lists.py @@ -4,9 +4,11 @@ import re -from kdoc.kdoc_re import KernRe, NestedMatch +from kdoc.kdoc_re import KernRe +from kdoc.c_lex import CMatch, CTokenizer + +struct_args_pattern = r"([^,)]+)" -struct_args_pattern = r'([^,)]+)' class CTransforms: """ @@ -15,137 +17,106 @@ class CTransforms: into something we can parse and generate kdoc for. """ + # + # NOTE: + # Due to performance reasons, place CMatch rules before KernRe, + # as this avoids running the C parser every time. + # + #: Transforms for structs and unions. struct_xforms = [ - # Strip attributes - (KernRe(r"__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)", flags=re.I | re.S, cache=False), ' '), - (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '), - (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '), - (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '), - (KernRe(r'\s*__guarded_by\s*\([^\)]*\)', re.S), ' '), - (KernRe(r'\s*__pt_guarded_by\s*\([^\)]*\)', re.S), ' '), - (KernRe(r'\s*__packed\s*', re.S), ' '), - (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '), - (KernRe(r'\s*__private', re.S), ' '), - (KernRe(r'\s*__rcu', re.S), ' '), - (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '), - (KernRe(r'\s*____cacheline_aligned', re.S), ' '), - (KernRe(r'\s*__cacheline_group_(begin|end)\([^\)]+\);'), ''), - # - # Unwrap struct_group macros based on this definition: - # __struct_group(TAG, NAME, ATTRS, MEMBERS...) - # which has variants like: struct_group(NAME, MEMBERS...) - # Only MEMBERS arguments require documentation. - # - # Parsing them happens on two steps: - # - # 1. drop struct group arguments that aren't at MEMBERS, - # storing them as STRUCT_GROUP(MEMBERS) - # - # 2. remove STRUCT_GROUP() ancillary macro. - # - # The original logic used to remove STRUCT_GROUP() using an - # advanced regex: - # - # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*; - # - # with two patterns that are incompatible with - # Python re module, as it has: - # - # - a recursive pattern: (?1) - # - an atomic grouping: (?>...) - # - # I tried a simpler version: but it didn't work either: - # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*; - # - # As it doesn't properly match the end parenthesis on some cases. - # - # So, a better solution was crafted: there's now a NestedMatch - # class that ensures that delimiters after a search are properly - # matched. So, the implementation to drop STRUCT_GROUP() will be - # handled in separate. - # - (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('), - (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('), - (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('), - (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('), - # - # Replace macros - # - # TODO: use NestedMatch for FOO($1, $2, ...) matches + (CMatch("__attribute__"), ""), + (CMatch("__aligned"), ""), + (CMatch("__counted_by"), ""), + (CMatch("__counted_by_(le|be)"), ""), + (CMatch("__guarded_by"), ""), + (CMatch("__pt_guarded_by"), ""), + (CMatch("__packed"), ""), + (CMatch("CRYPTO_MINALIGN_ATTR"), ""), + (CMatch("__private"), ""), + (CMatch("__rcu"), ""), + (CMatch("____cacheline_aligned_in_smp"), ""), + (CMatch("____cacheline_aligned"), ""), + (CMatch("__cacheline_group_(?:begin|end)"), ""), + (CMatch("__ETHTOOL_DECLARE_LINK_MODE_MASK"), r"DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)"), + (CMatch("DECLARE_PHY_INTERFACE_MASK",),r"DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)"), + (CMatch("DECLARE_BITMAP"), r"unsigned long \1[BITS_TO_LONGS(\2)]"), + (CMatch("DECLARE_HASHTABLE"), r"unsigned long \1[1 << ((\2) - 1)]"), + (CMatch("DECLARE_KFIFO"), r"\2 *\1"), + (CMatch("DECLARE_KFIFO_PTR"), r"\2 *\1"), + (CMatch("(?:__)?DECLARE_FLEX_ARRAY"), r"\1 \2[]"), + (CMatch("DEFINE_DMA_UNMAP_ADDR"), r"dma_addr_t \1"), + (CMatch("DEFINE_DMA_UNMAP_LEN"), r"__u32 \1"), + (CMatch("VIRTIO_DECLARE_FEATURES"), r"union { u64 \1; u64 \1_array[VIRTIO_FEATURES_U64S]; }"), + (CMatch("__cond_acquires"), ""), + (CMatch("__cond_releases"), ""), + (CMatch("__acquires"), ""), + (CMatch("__releases"), ""), + (CMatch("__must_hold"), ""), + (CMatch("__must_not_hold"), ""), + (CMatch("__must_hold_shared"), ""), + (CMatch("__cond_acquires_shared"), ""), + (CMatch("__acquires_shared"), ""), + (CMatch("__releases_shared"), ""), + (CMatch("__attribute__"), ""), + # - # it is better to also move those to the NestedMatch logic, - # to ensure that parentheses will be properly matched. + # Macro __struct_group() creates an union with an anonymous + # and a non-anonymous struct, depending on the parameters. We only + # need one of those at kernel-doc, as we won't be documenting the same + # members twice. # - (KernRe(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)', re.S), - r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'), - (KernRe(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^\)]+)\)', re.S), - r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'), - (KernRe(r'DECLARE_BITMAP\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)', - re.S), r'unsigned long \1[BITS_TO_LONGS(\2)]'), - (KernRe(r'DECLARE_HASHTABLE\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)', - re.S), r'unsigned long \1[1 << ((\2) - 1)]'), - (KernRe(r'DECLARE_KFIFO\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + - r',\s*' + struct_args_pattern + r'\)', re.S), r'\2 *\1'), - (KernRe(r'DECLARE_KFIFO_PTR\s*\(' + struct_args_pattern + r',\s*' + - struct_args_pattern + r'\)', re.S), r'\2 *\1'), - (KernRe(r'(?:__)?DECLARE_FLEX_ARRAY\s*\(' + struct_args_pattern + r',\s*' + - struct_args_pattern + r'\)', re.S), r'\1 \2[]'), - (KernRe(r'DEFINE_DMA_UNMAP_ADDR\s*\(' + struct_args_pattern + r'\)', re.S), r'dma_addr_t \1'), - (KernRe(r'DEFINE_DMA_UNMAP_LEN\s*\(' + struct_args_pattern + r'\)', re.S), r'__u32 \1'), - (KernRe(r'VIRTIO_DECLARE_FEATURES\(([\w_]+)\)'), r'union { u64 \1; u64 \1_array[VIRTIO_FEATURES_U64S]; }'), - - (NestedMatch(r"__cond_acquires\s*\("), ""), - (NestedMatch(r"__cond_releases\s*\("), ""), - (NestedMatch(r"__acquires\s*\("), ""), - (NestedMatch(r"__releases\s*\("), ""), - (NestedMatch(r"__must_hold\s*\("), ""), - (NestedMatch(r"__must_not_hold\s*\("), ""), - (NestedMatch(r"__must_hold_shared\s*\("), ""), - (NestedMatch(r"__cond_acquires_shared\s*\("), ""), - (NestedMatch(r"__acquires_shared\s*\("), ""), - (NestedMatch(r"__releases_shared\s*\("), ""), - (NestedMatch(r'\bSTRUCT_GROUP\('), r'\0'), + (CMatch("struct_group"), r"struct { \2+ };"), + (CMatch("struct_group_attr"), r"struct { \3+ };"), + (CMatch("struct_group_tagged"), r"struct { \3+ };"), + (CMatch("__struct_group"), r"struct { \4+ };"), ] #: Transforms for function prototypes. function_xforms = [ - (KernRe(r"^static +"), ""), - (KernRe(r"^extern +"), ""), - (KernRe(r"^asmlinkage +"), ""), - (KernRe(r"^inline +"), ""), - (KernRe(r"^__inline__ +"), ""), - (KernRe(r"^__inline +"), ""), - (KernRe(r"^__always_inline +"), ""), - (KernRe(r"^noinline +"), ""), - (KernRe(r"^__FORTIFY_INLINE +"), ""), - (KernRe(r"__init +"), ""), - (KernRe(r"__init_or_module +"), ""), - (KernRe(r"__exit +"), ""), - (KernRe(r"__deprecated +"), ""), - (KernRe(r"__flatten +"), ""), - (KernRe(r"__meminit +"), ""), - (KernRe(r"__must_check +"), ""), - (KernRe(r"__weak +"), ""), - (KernRe(r"__sched +"), ""), - (KernRe(r"_noprof"), ""), - (KernRe(r"__always_unused *"), ""), - (KernRe(r"__printf\s*\(\s*\d*\s*,\s*\d*\s*\) +"), ""), - (KernRe(r"__(?:re)?alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) +"), ""), - (KernRe(r"__diagnose_as\s*\(\s*\S+\s*(?:,\s*\d+\s*)*\) +"), ""), - (KernRe(r"DECL_BUCKET_PARAMS\s*\(\s*(\S+)\s*,\s*(\S+)\s*\)"), r"\1, \2"), - (KernRe(r"__no_context_analysis\s*"), ""), - (KernRe(r"__attribute_const__ +"), ""), - (KernRe(r"__attribute__\s*\(\((?:[\w\s]+(?:\([^)]*\))?\s*,?)+\)\)\s+"), ""), + (CMatch("static"), ""), + (CMatch("extern"), ""), + (CMatch("asmlinkage"), ""), + (CMatch("inline"), ""), + (CMatch("__inline__"), ""), + (CMatch("__inline"), ""), + (CMatch("__always_inline"), ""), + (CMatch("noinline"), ""), + (CMatch("__FORTIFY_INLINE"), ""), + (CMatch("__init"), ""), + (CMatch("__init_or_module"), ""), + (CMatch("__exit"), ""), + (CMatch("__deprecated"), ""), + (CMatch("__flatten"), ""), + (CMatch("__meminit"), ""), + (CMatch("__must_check"), ""), + (CMatch("__weak"), ""), + (CMatch("__sched"), ""), + (CMatch("__always_unused"), ""), + (CMatch("__printf"), ""), + (CMatch("__(?:re)?alloc_size"), ""), + (CMatch("__diagnose_as"), ""), + (CMatch("DECL_BUCKET_PARAMS"), r"\1, \2"), + (CMatch("__no_context_analysis"), ""), + (CMatch("__attribute_const__"), ""), + (CMatch("__attribute__"), ""), + + # + # HACK: this is similar to process_export() hack. It is meant to + # drop _noproof from function name. See for instance: + # ahash_request_alloc kernel-doc declaration at include/crypto/hash.h. + # + (KernRe("_noprof"), ""), ] #: Transforms for variable prototypes. var_xforms = [ - (KernRe(r"__read_mostly"), ""), - (KernRe(r"__ro_after_init"), ""), - (KernRe(r'\s*__guarded_by\s*\([^\)]*\)', re.S), ""), - (KernRe(r'\s*__pt_guarded_by\s*\([^\)]*\)', re.S), ""), - (KernRe(r"LIST_HEAD\(([\w_]+)\)"), r"struct list_head \1"), + (CMatch("__read_mostly"), ""), + (CMatch("__ro_after_init"), ""), + (CMatch("__guarded_by"), ""), + (CMatch("__pt_guarded_by"), ""), + (CMatch("LIST_HEAD"), r"struct list_head \1"), + (KernRe(r"(?://.*)$"), ""), (KernRe(r"(?:/\*.*\*/)"), ""), (KernRe(r";$"), ""), @@ -158,13 +129,25 @@ class CTransforms: "var": var_xforms, } - def apply(self, xforms_type, text): + def apply(self, xforms_type, source): """ - Apply a set of transforms to a block of text. + Apply a set of transforms to a block of source. + + As tokenizer is used here, this function also remove comments + at the end. """ if xforms_type not in self.xforms: - return text + return source + + if isinstance(source, str): + source = CTokenizer(source) for search, subst in self.xforms[xforms_type]: - text = search.sub(subst, text) - return text + # + # KernRe only accept strings. + # + if isinstance(search, KernRe): + source = str(source) + + source = search.sub(subst, source) + return str(source) |
