summaryrefslogtreecommitdiffstats
path: root/tools/lib/python/kdoc/xforms_lists.py
diff options
context:
space:
mode:
authorJonathan Corbet <corbet@lwn.net>2026-03-22 15:06:59 -0600
committerJonathan Corbet <corbet@lwn.net>2026-03-22 15:06:59 -0600
commit781171bec0650c00c642564afcb5cce57abda5bf (patch)
tree31e8cf6d41b40bef9f432ebbe4a8180f32af9fbf /tools/lib/python/kdoc/xforms_lists.py
parent6108c809f4fd9dbb1a138ba4326d645cc3113a8d (diff)
parent7538df7a2d7d26428803cf8053476169a6d28659 (diff)
downloadlinux-781171bec0650c00c642564afcb5cce57abda5bf.tar.gz
linux-781171bec0650c00c642564afcb5cce57abda5bf.zip
Merge branch 'mauro' into docs-mw
Mauro says: This patch series changes how the kdoc parser handles macro replacements. Instead of heavily relying on regular expressions that can sometimes be very complex, it uses a C lexical tokenizer. This ensures that BEGIN/END blocks on functions and structs are properly handled, even when nested. Checking before/after the patch series, the output for both man pages and rst only had: - whitespace differences; - struct_group macros are now shown as inner anonymous structs, as they should be. Also, I didn't notice any relevant change in the documentation build time. In that regard, right now, every time a CMatch replacement rule takes place, it does: for each transform: - tokenize the source code; - handle CMatch; - convert tokens back to a string. A possible optimization would be to do, instead: - tokenize the source code; - for each transform, handle CMatch; - convert tokens back to a string. For now, I opted not to do it, because: - too many changes in a single row; - the docs build time is ~3:30 minutes, which is about the same time it was taking before the changes; - there is a very dirty hack inside function_xforms: (KernRe(r"_noprof"), ""). This is meant to change function prototypes instead of function arguments. So, if ok for you, I would prefer to merge this one first. We can later optimize kdoc_parser to avoid multiple token <-> string conversions. - One important aspect of this series is that it introduces unittests for kernel-doc. I used them a lot during the development of this series, to ensure that the changes I was making were producing the expected results. Tests are in two separate files that can be executed directly.
Alternatively, there is a run.py script that runs all of them (and any other python script named tools/unittests/test_*.py"): $ tools/unittests/run.py test_cmatch: TestSearch: test_search_acquires_multiple: OK test_search_acquires_nested_paren: OK test_search_acquires_simple: OK test_search_must_hold: OK test_search_must_hold_shared: OK test_search_no_false_positive: OK test_search_no_function: OK test_search_no_macro_remains: OK TestSubMultipleMacros: test_acquires_multiple: OK test_acquires_nested_paren: OK test_acquires_simple: OK test_mixed_macros: OK test_must_hold: OK test_must_hold_shared: OK test_no_false_positive: OK test_no_function: OK test_no_macro_remains: OK TestSubSimple: test_rise_early_greedy: OK test_rise_multiple_greedy: OK test_strip_multiple_acquires: OK test_sub_count_parameter: OK test_sub_mixed_placeholders: OK test_sub_multiple_placeholders: OK test_sub_no_placeholder: OK test_sub_single_placeholder: OK test_sub_with_capture: OK test_sub_zero_placeholder: OK TestSubWithLocalXforms: test_functions_with_acquires_and_releases: OK test_raw_struct_group: OK test_raw_struct_group_tagged: OK test_struct_group: OK test_struct_group_attr: OK test_struct_group_tagged_with_private: OK test_struct_kcov: OK test_vars_stackdepot: OK test_tokenizer: TestPublicPrivate: test_balanced_inner_private: OK test_balanced_non_greddy_private: OK test_balanced_private: OK test_no private: OK test_unbalanced_inner_private: OK test_unbalanced_private: OK test_unbalanced_struct_group_tagged_with_private: OK test_unbalanced_two_struct_group_tagged_first_with_private: OK test_unbalanced_without_end_of_line: OK TestTokenizer: test_basic_tokens: OK test_depth_counters: OK test_mismatch_error: OK Ran 47 tests
Diffstat (limited to 'tools/lib/python/kdoc/xforms_lists.py')
-rw-r--r--tools/lib/python/kdoc/xforms_lists.py235
1 files changed, 109 insertions, 126 deletions
diff --git a/tools/lib/python/kdoc/xforms_lists.py b/tools/lib/python/kdoc/xforms_lists.py
index c07cbe1e6349..f6ea9efb11ae 100644
--- a/tools/lib/python/kdoc/xforms_lists.py
+++ b/tools/lib/python/kdoc/xforms_lists.py
@@ -4,9 +4,11 @@
import re
-from kdoc.kdoc_re import KernRe, NestedMatch
+from kdoc.kdoc_re import KernRe
+from kdoc.c_lex import CMatch, CTokenizer
+
+struct_args_pattern = r"([^,)]+)"
-struct_args_pattern = r'([^,)]+)'
class CTransforms:
"""
@@ -15,137 +17,106 @@ class CTransforms:
into something we can parse and generate kdoc for.
"""
+ #
+ # NOTE:
+ # Due to performance reasons, place CMatch rules before KernRe,
+ # as this avoids running the C parser every time.
+ #
+
#: Transforms for structs and unions.
struct_xforms = [
- # Strip attributes
- (KernRe(r"__attribute__\s*\(\([a-z0-9,_\*\s\(\)]*\)\)", flags=re.I | re.S, cache=False), ' '),
- (KernRe(r'\s*__aligned\s*\([^;]*\)', re.S), ' '),
- (KernRe(r'\s*__counted_by\s*\([^;]*\)', re.S), ' '),
- (KernRe(r'\s*__counted_by_(le|be)\s*\([^;]*\)', re.S), ' '),
- (KernRe(r'\s*__guarded_by\s*\([^\)]*\)', re.S), ' '),
- (KernRe(r'\s*__pt_guarded_by\s*\([^\)]*\)', re.S), ' '),
- (KernRe(r'\s*__packed\s*', re.S), ' '),
- (KernRe(r'\s*CRYPTO_MINALIGN_ATTR', re.S), ' '),
- (KernRe(r'\s*__private', re.S), ' '),
- (KernRe(r'\s*__rcu', re.S), ' '),
- (KernRe(r'\s*____cacheline_aligned_in_smp', re.S), ' '),
- (KernRe(r'\s*____cacheline_aligned', re.S), ' '),
- (KernRe(r'\s*__cacheline_group_(begin|end)\([^\)]+\);'), ''),
- #
- # Unwrap struct_group macros based on this definition:
- # __struct_group(TAG, NAME, ATTRS, MEMBERS...)
- # which has variants like: struct_group(NAME, MEMBERS...)
- # Only MEMBERS arguments require documentation.
- #
- # Parsing them happens on two steps:
- #
- # 1. drop struct group arguments that aren't at MEMBERS,
- # storing them as STRUCT_GROUP(MEMBERS)
- #
- # 2. remove STRUCT_GROUP() ancillary macro.
- #
- # The original logic used to remove STRUCT_GROUP() using an
- # advanced regex:
- #
- # \bSTRUCT_GROUP(\(((?:(?>[^)(]+)|(?1))*)\))[^;]*;
- #
- # with two patterns that are incompatible with
- # Python re module, as it has:
- #
- # - a recursive pattern: (?1)
- # - an atomic grouping: (?>...)
- #
- # I tried a simpler version: but it didn't work either:
- # \bSTRUCT_GROUP\(([^\)]+)\)[^;]*;
- #
- # As it doesn't properly match the end parenthesis on some cases.
- #
- # So, a better solution was crafted: there's now a NestedMatch
- # class that ensures that delimiters after a search are properly
- # matched. So, the implementation to drop STRUCT_GROUP() will be
- # handled in separate.
- #
- (KernRe(r'\bstruct_group\s*\(([^,]*,)', re.S), r'STRUCT_GROUP('),
- (KernRe(r'\bstruct_group_attr\s*\(([^,]*,){2}', re.S), r'STRUCT_GROUP('),
- (KernRe(r'\bstruct_group_tagged\s*\(([^,]*),([^,]*),', re.S), r'struct \1 \2; STRUCT_GROUP('),
- (KernRe(r'\b__struct_group\s*\(([^,]*,){3}', re.S), r'STRUCT_GROUP('),
- #
- # Replace macros
- #
- # TODO: use NestedMatch for FOO($1, $2, ...) matches
+ (CMatch("__attribute__"), ""),
+ (CMatch("__aligned"), ""),
+ (CMatch("__counted_by"), ""),
+ (CMatch("__counted_by_(le|be)"), ""),
+ (CMatch("__guarded_by"), ""),
+ (CMatch("__pt_guarded_by"), ""),
+ (CMatch("__packed"), ""),
+ (CMatch("CRYPTO_MINALIGN_ATTR"), ""),
+ (CMatch("__private"), ""),
+ (CMatch("__rcu"), ""),
+ (CMatch("____cacheline_aligned_in_smp"), ""),
+ (CMatch("____cacheline_aligned"), ""),
+ (CMatch("__cacheline_group_(?:begin|end)"), ""),
+ (CMatch("__ETHTOOL_DECLARE_LINK_MODE_MASK"), r"DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)"),
+ (CMatch("DECLARE_PHY_INTERFACE_MASK",),r"DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)"),
+ (CMatch("DECLARE_BITMAP"), r"unsigned long \1[BITS_TO_LONGS(\2)]"),
+ (CMatch("DECLARE_HASHTABLE"), r"unsigned long \1[1 << ((\2) - 1)]"),
+ (CMatch("DECLARE_KFIFO"), r"\2 *\1"),
+ (CMatch("DECLARE_KFIFO_PTR"), r"\2 *\1"),
+ (CMatch("(?:__)?DECLARE_FLEX_ARRAY"), r"\1 \2[]"),
+ (CMatch("DEFINE_DMA_UNMAP_ADDR"), r"dma_addr_t \1"),
+ (CMatch("DEFINE_DMA_UNMAP_LEN"), r"__u32 \1"),
+ (CMatch("VIRTIO_DECLARE_FEATURES"), r"union { u64 \1; u64 \1_array[VIRTIO_FEATURES_U64S]; }"),
+ (CMatch("__cond_acquires"), ""),
+ (CMatch("__cond_releases"), ""),
+ (CMatch("__acquires"), ""),
+ (CMatch("__releases"), ""),
+ (CMatch("__must_hold"), ""),
+ (CMatch("__must_not_hold"), ""),
+ (CMatch("__must_hold_shared"), ""),
+ (CMatch("__cond_acquires_shared"), ""),
+ (CMatch("__acquires_shared"), ""),
+ (CMatch("__releases_shared"), ""),
+ (CMatch("__attribute__"), ""),
+
#
- # it is better to also move those to the NestedMatch logic,
- # to ensure that parentheses will be properly matched.
+ # Macro __struct_group() creates a union with an anonymous
+ # and a non-anonymous struct, depending on the parameters. We only
+ # need one of those in kernel-doc, as we won't be documenting the same
+ # members twice.
#
- (KernRe(r'__ETHTOOL_DECLARE_LINK_MODE_MASK\s*\(([^\)]+)\)', re.S),
- r'DECLARE_BITMAP(\1, __ETHTOOL_LINK_MODE_MASK_NBITS)'),
- (KernRe(r'DECLARE_PHY_INTERFACE_MASK\s*\(([^\)]+)\)', re.S),
- r'DECLARE_BITMAP(\1, PHY_INTERFACE_MODE_MAX)'),
- (KernRe(r'DECLARE_BITMAP\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)',
- re.S), r'unsigned long \1[BITS_TO_LONGS(\2)]'),
- (KernRe(r'DECLARE_HASHTABLE\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern + r'\)',
- re.S), r'unsigned long \1[1 << ((\2) - 1)]'),
- (KernRe(r'DECLARE_KFIFO\s*\(' + struct_args_pattern + r',\s*' + struct_args_pattern +
- r',\s*' + struct_args_pattern + r'\)', re.S), r'\2 *\1'),
- (KernRe(r'DECLARE_KFIFO_PTR\s*\(' + struct_args_pattern + r',\s*' +
- struct_args_pattern + r'\)', re.S), r'\2 *\1'),
- (KernRe(r'(?:__)?DECLARE_FLEX_ARRAY\s*\(' + struct_args_pattern + r',\s*' +
- struct_args_pattern + r'\)', re.S), r'\1 \2[]'),
- (KernRe(r'DEFINE_DMA_UNMAP_ADDR\s*\(' + struct_args_pattern + r'\)', re.S), r'dma_addr_t \1'),
- (KernRe(r'DEFINE_DMA_UNMAP_LEN\s*\(' + struct_args_pattern + r'\)', re.S), r'__u32 \1'),
- (KernRe(r'VIRTIO_DECLARE_FEATURES\(([\w_]+)\)'), r'union { u64 \1; u64 \1_array[VIRTIO_FEATURES_U64S]; }'),
-
- (NestedMatch(r"__cond_acquires\s*\("), ""),
- (NestedMatch(r"__cond_releases\s*\("), ""),
- (NestedMatch(r"__acquires\s*\("), ""),
- (NestedMatch(r"__releases\s*\("), ""),
- (NestedMatch(r"__must_hold\s*\("), ""),
- (NestedMatch(r"__must_not_hold\s*\("), ""),
- (NestedMatch(r"__must_hold_shared\s*\("), ""),
- (NestedMatch(r"__cond_acquires_shared\s*\("), ""),
- (NestedMatch(r"__acquires_shared\s*\("), ""),
- (NestedMatch(r"__releases_shared\s*\("), ""),
- (NestedMatch(r'\bSTRUCT_GROUP\('), r'\0'),
+ (CMatch("struct_group"), r"struct { \2+ };"),
+ (CMatch("struct_group_attr"), r"struct { \3+ };"),
+ (CMatch("struct_group_tagged"), r"struct { \3+ };"),
+ (CMatch("__struct_group"), r"struct { \4+ };"),
]
#: Transforms for function prototypes.
function_xforms = [
- (KernRe(r"^static +"), ""),
- (KernRe(r"^extern +"), ""),
- (KernRe(r"^asmlinkage +"), ""),
- (KernRe(r"^inline +"), ""),
- (KernRe(r"^__inline__ +"), ""),
- (KernRe(r"^__inline +"), ""),
- (KernRe(r"^__always_inline +"), ""),
- (KernRe(r"^noinline +"), ""),
- (KernRe(r"^__FORTIFY_INLINE +"), ""),
- (KernRe(r"__init +"), ""),
- (KernRe(r"__init_or_module +"), ""),
- (KernRe(r"__exit +"), ""),
- (KernRe(r"__deprecated +"), ""),
- (KernRe(r"__flatten +"), ""),
- (KernRe(r"__meminit +"), ""),
- (KernRe(r"__must_check +"), ""),
- (KernRe(r"__weak +"), ""),
- (KernRe(r"__sched +"), ""),
- (KernRe(r"_noprof"), ""),
- (KernRe(r"__always_unused *"), ""),
- (KernRe(r"__printf\s*\(\s*\d*\s*,\s*\d*\s*\) +"), ""),
- (KernRe(r"__(?:re)?alloc_size\s*\(\s*\d+\s*(?:,\s*\d+\s*)?\) +"), ""),
- (KernRe(r"__diagnose_as\s*\(\s*\S+\s*(?:,\s*\d+\s*)*\) +"), ""),
- (KernRe(r"DECL_BUCKET_PARAMS\s*\(\s*(\S+)\s*,\s*(\S+)\s*\)"), r"\1, \2"),
- (KernRe(r"__no_context_analysis\s*"), ""),
- (KernRe(r"__attribute_const__ +"), ""),
- (KernRe(r"__attribute__\s*\(\((?:[\w\s]+(?:\([^)]*\))?\s*,?)+\)\)\s+"), ""),
+ (CMatch("static"), ""),
+ (CMatch("extern"), ""),
+ (CMatch("asmlinkage"), ""),
+ (CMatch("inline"), ""),
+ (CMatch("__inline__"), ""),
+ (CMatch("__inline"), ""),
+ (CMatch("__always_inline"), ""),
+ (CMatch("noinline"), ""),
+ (CMatch("__FORTIFY_INLINE"), ""),
+ (CMatch("__init"), ""),
+ (CMatch("__init_or_module"), ""),
+ (CMatch("__exit"), ""),
+ (CMatch("__deprecated"), ""),
+ (CMatch("__flatten"), ""),
+ (CMatch("__meminit"), ""),
+ (CMatch("__must_check"), ""),
+ (CMatch("__weak"), ""),
+ (CMatch("__sched"), ""),
+ (CMatch("__always_unused"), ""),
+ (CMatch("__printf"), ""),
+ (CMatch("__(?:re)?alloc_size"), ""),
+ (CMatch("__diagnose_as"), ""),
+ (CMatch("DECL_BUCKET_PARAMS"), r"\1, \2"),
+ (CMatch("__no_context_analysis"), ""),
+ (CMatch("__attribute_const__"), ""),
+ (CMatch("__attribute__"), ""),
+
+ #
+ # HACK: this is similar to the process_export() hack. It is meant to
+ # drop _noprof from the function name. See for instance the
+ # ahash_request_alloc kernel-doc declaration at include/crypto/hash.h.
+ #
+ (KernRe("_noprof"), ""),
]
#: Transforms for variable prototypes.
var_xforms = [
- (KernRe(r"__read_mostly"), ""),
- (KernRe(r"__ro_after_init"), ""),
- (KernRe(r'\s*__guarded_by\s*\([^\)]*\)', re.S), ""),
- (KernRe(r'\s*__pt_guarded_by\s*\([^\)]*\)', re.S), ""),
- (KernRe(r"LIST_HEAD\(([\w_]+)\)"), r"struct list_head \1"),
+ (CMatch("__read_mostly"), ""),
+ (CMatch("__ro_after_init"), ""),
+ (CMatch("__guarded_by"), ""),
+ (CMatch("__pt_guarded_by"), ""),
+ (CMatch("LIST_HEAD"), r"struct list_head \1"),
+
(KernRe(r"(?://.*)$"), ""),
(KernRe(r"(?:/\*.*\*/)"), ""),
(KernRe(r";$"), ""),
@@ -158,13 +129,25 @@ class CTransforms:
"var": var_xforms,
}
- def apply(self, xforms_type, text):
+ def apply(self, xforms_type, source):
"""
- Apply a set of transforms to a block of text.
+ Apply a set of transforms to a block of source.
+
+ As the tokenizer is used here, this function also removes comments
+ at the end.
"""
if xforms_type not in self.xforms:
- return text
+ return source
+
+ if isinstance(source, str):
+ source = CTokenizer(source)
for search, subst in self.xforms[xforms_type]:
- text = search.sub(subst, text)
- return text
+ #
+ # KernRe only accepts strings.
+ #
+ if isinstance(search, KernRe):
+ source = str(source)
+
+ source = search.sub(subst, source)
+ return str(source)