In preparation for letting the kerneldoc Sphinx extension import Python libraries, move the regex ancillary classes to a separate file. Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org> Signed-off-by: Jonathan Corbet <corbet@lwn.net> Link: https://lore.kernel.org/r/64f96b6744435b51894bb4ab7612851d9d054190.1744106241.git.mchehab+huawei@kernel.org
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and perform matching for kernel-doc.
"""

import re

# Local cache for regular expressions
re_cache = {}


class Re:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding two
    regular expressions together into a new one.

    Regular expressions can be cached via an argument, helping to speed
    up searches.
    """

    def _add_regex(self, string, flags):
        """
        Add a new regex or reuse it from the cache.
        """

        if string in re_cache:
            self.regex = re_cache[string]
        else:
            self.regex = re.compile(string, flags=flags)

            if self.cache:
                re_cache[string] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.
        """

        return Re(str(self) + str(other), cache=self.cache or other.cache,
                  flags=self.regex.flags | other.regex.flags)
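
    # For instance (illustrative only), Re(r"\d+\s*") + Re(r"\w+")
    # behaves like Re(r"\d+\s*\w+"), with the flags of both operands
    # OR-ed together and caching enabled if either operand caches.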

    def match(self, string):
        """
        Handles a re.match, storing its result.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search, storing its result.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.
        """

        return self.last_match.group(num)
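

# A minimal usage sketch for Re (illustrative, not part of the original
# file): the compiled pattern is cached under its pattern string, and
# the last match is stored so groups can be read right after a search:
#
#     func = Re(r"(\w+)\s*\(")
#     if func.search("int foo(void)"):
#         name = func.group(1)      # -> "foo"
#
# A second Re built from the same pattern string reuses the compiled
# object from re_cache instead of compiling it again.
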
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its normal re module, as several advanced
    regular expression features are missing.

    This is the case for this pattern:

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses of a
    STRUCT_GROUP() string search.

    This class counts pairs of delimiters, using them to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although it was re-implemented to be more generic and to match three
    types of delimiters. The logic checks whether delimiters are paired;
    if not, the search string is ignored.
    """

    # TODO:
    # Right now, regular expressions to match are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside the parentheses is converted into a single
    # replace group (e.g. r'\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # It is probably easier to define it not as a regular expression,
    # but with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but this ended up as a different implementation, in order to
        align all three types of delimiters and to seek for an initial
        regular expression.

        The algorithm seeks open/close paired delimiters, placing the
        expected closing delimiters on a stack and yielding the
        start/stop positions of each match once the stack is emptied.

        The algorithm should work fine for properly paired lines, but
        will silently ignore end delimiters that precede a start
        delimiter. This should be OK for the kernel-doc parser, as
        unpaired delimiters would cause compilation errors, so there is
        no need to raise exceptions to cover such issues.
        """

        stack = []

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end at a start delimiter
            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            end = self.DELIMITER_PAIRS[d]
            stack.append(end)

            # Track the closing delimiter expected for each opener seen
            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    end = self.DELIMITER_PAIRS[d]

                    stack.append(end)
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break
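
    # For example (illustrative): with regex r"\bFOO\(" and the line
    #     "FOO(bar(x), [1, 2]) tail"
    # _search() yields (0, 4, 19): the match starts at 0, the content
    # begins at offset 4 (just past the start delimiter), and the
    # closing parenthesis at position 18 makes the stop position 19.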

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter, returning
        occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):
            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        """
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter, replacing
        occurrences only if all delimiters are paired.

        If r'\1' is used in sub, it works just like re: the matched
        paired data, with the delimiters stripped, is placed there.

        If count is nonzero, it will replace at most count occurrences.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # Replace \1 in the sub string, if \1 is used there
            new_sub = sub
            new_sub = new_sub.replace(r'\1', value)

            out += new_sub

            # Drop the end ';', if any; guard against a match that ends
            # at the end of the line
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Honor the count limit, stopping after count replacements
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out
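

if __name__ == "__main__":
    # Illustrative demo, not part of the original kernel file: it
    # exercises NestedMatch on a STRUCT_GROUP()-like line when the
    # module is run directly.
    nested = NestedMatch()
    macro = re.compile(r"\bSTRUCT_GROUP\(")
    line = "STRUCT_GROUP(struct { int a; } grp;) rest;"

    # search() yields the full paired span, including nested braces
    for found in nested.search(macro, line):
        print(found)        # STRUCT_GROUP(struct { int a; } grp;)

    # sub() replaces the span with the content between the delimiters
    print(nested.sub(macro, r"\1", line))       # struct { int a; } grp; rest;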