""" Basic glob match implementation for starlark. This was originally developed by @jbedard for use in rules_js (https://github.com/aspect-build/rules_js/blob/6ca32d5199ddc0bf19bd704f591030dc1468ca5f/npm/private/pkg_glob.bzl) to support the pnpm public-hoist-expr option (https://pnpm.io/npmrc#public-hoist-expr). The pnpm implementation and tests were used as a reference implementation: https://github.com/pnpm/pnpm/blob/v7.4.0-2/packages/matcher/src/index.ts https://github.com/pnpm/pnpm/blob/v7.4.0-2/packages/matcher/test/index.ts """ def _split_on(expr, splits): # Splits an expression on the tokens in splits but keeps the tokens split in the result. # Tokens are matched in order so a token such as `**` should come before `*`. result = [] accumulator = "" skip = 0 has_splits = False for i in range(len(expr)): j = i + skip if j >= len(expr): break for split in splits: if not split: fail("empty split token") if expr[j:].startswith(split): if accumulator: result.append(accumulator) accumulator = "" result.append(split) skip = skip + len(split) j = i + skip has_splits = True break if j >= len(expr): break accumulator = accumulator + expr[j] if accumulator: result.append(accumulator) return result, has_splits GLOB_SYMBOLS = ["**", "*", "?"] def is_glob(expr): """Determine if the passed string is a globa expression Args: expr: the potential glob expression Returns: True if the passed string is a globa expression """ for s in GLOB_SYMBOLS: if -1 != expr.find(s): return True return False def glob_match(expr, path, match_path_separator = False): """Test if the passed path matches the glob expression. `*` A single asterisk stands for zero or more arbitrary characters except for the the path separator `/` if `match_path_separator` is False `?` The question mark stands for exactly one character except for the the path separator `/` if `match_path_separator` is False `**` A double asterisk stands for an arbitrary sequence of 0 or more characters. It is only allowed when preceded by either the beginning of the string or a slash. Likewise it must be followed by a slash or the end of the pattern. Args: expr: the glob expression path: the path against which to match the glob expression match_path_separator: whether or not to match the path separator '/' when matching `*` and `?` expressions Returns: True if the path matches the glob expression """ if expr == "": fail("glob_match: invalid empty glob expression") if not is_glob(expr): # the expression is not a glob (does bot have any glob symbols) so the only match is an exact match return expr == path if expr.find("***") != -1: fail("glob_match: invalid *** pattern found in glob expression") if expr == "**": # matches everything return True expr_parts, has_splits = _split_on(expr, GLOB_SYMBOLS[:]) # Quick exit for simple cases. if not has_splits: return expr == path for i, expr_part in enumerate(expr_parts): if expr_part == "**": if i > 0 and not expr_parts[i - 1].endswith("/"): msg = "glob_match: `**` globstar in expression `{}` must be at the start of the expression or preceeded by `/`".format(expr) fail(msg) if i != len(expr_parts) - 1 and not expr_parts[i + 1].startswith("/"): msg = "glob_match: `**` globstar in expression `{}` must be at the end of the expression or followed by `/`".format(expr) fail(msg) # Locations a * was terminated that can be rolled back to. branches = [] expr_i = 0 path_i = 0 # Loop "forever" (2^30). for _ in range(1073741824): subpath = path[path_i:] if path_i < len(path) else None subexpr = expr_parts[expr_i] if expr_i < len(expr_parts) else None # The next part of the expression. next_pp = expr_parts[expr_i + 1] if expr_i + 1 < len(expr_parts) else None stop_at_leading_path_separator = not match_path_separator and subpath != None and subpath.startswith("/") stop_at_contained_path_separator = not match_path_separator and subpath != None and subpath.find("/") != -1 if (subexpr == "*" and subpath != None and not stop_at_leading_path_separator) or (subexpr == "**" and subpath != None): # A wildcard or globstar in the expression and something to consume. if next_pp == None and not stop_at_contained_path_separator: # This wildcard is the last and matches everything beyond here. return True # If the next part of the expression matches the current subpath # then advance past the wildcard and consume that next expression. if next_pp != None and subpath.startswith(next_pp): # Persist the alternative of using the wildcard instead of advancing. branches.append([expr_i, path_i + 1]) expr_i = expr_i + 1 else: # Otherwise consume the next character. path_i = path_i + 1 elif subexpr == "*" and subpath != None and stop_at_leading_path_separator and next_pp != None and subpath.startswith(next_pp): # A wildcard that has hit a path separator but we can branch # Persist the alternative of using the wildcard instead of advancing. branches.append([expr_i, path_i + 1]) expr_i = expr_i + 1 elif subexpr == "?" and subpath != None and not stop_at_leading_path_separator: # The string matches a ? wildcard at the current location in the path. expr_i = expr_i + 1 path_i = path_i + 1 elif subexpr and subpath != None and subpath.startswith(subexpr): # The string matches the current location in the path. expr_i = expr_i + 1 path_i = path_i + len(subexpr) elif subpath == None and expr_i == len(expr_parts) - 1 and (subexpr == "*" or subexpr == "**"): # Reached the package on a final empty "*" or "**" expression return True elif len(branches) > 0: # The string does not match, backup to the previous branch. [restored_pattern_i, restored_path_i] = branches.pop() path_i = restored_path_i expr_i = restored_pattern_i else: # The string does not match, with no branches to rollback to, there is no match. return False if path_i == len(path) and expr_i == len(expr_parts): # Reached the end of the expression and package. return True fail("glob_match: reached the end of the (in)finite loop")