From a945d66eb5d7981f6ed4758c5444666417908b62 Mon Sep 17 00:00:00 2001 From: Greg Magolan Date: Thu, 28 Jul 2022 11:15:14 -0700 Subject: [PATCH] chore: add glob_match to docs (#195) --- docs/BUILD.bazel | 5 ++ docs/glob_match.md | 35 +++++++++ lib/BUILD.bazel | 9 +++ lib/glob_match.bzl | 141 +------------------------------------ lib/private/BUILD.bazel | 6 ++ lib/private/glob_match.bzl | 140 ++++++++++++++++++++++++++++++++++++ 6 files changed, 198 insertions(+), 138 deletions(-) create mode 100755 docs/glob_match.md create mode 100644 lib/private/glob_match.bzl diff --git a/docs/BUILD.bazel b/docs/BUILD.bazel index 20285f9..26694fe 100644 --- a/docs/BUILD.bazel +++ b/docs/BUILD.bazel @@ -88,6 +88,11 @@ stardoc_with_diff_test( bzl_library_target = "//lib:yq", ) +stardoc_with_diff_test( + name = "glob_match", + bzl_library_target = "//lib:glob_match", +) + stardoc_with_diff_test( name = "host_repo", bzl_library_target = "//lib:host_repo", diff --git a/docs/glob_match.md b/docs/glob_match.md new file mode 100755 index 0000000..5c9a4c3 --- /dev/null +++ b/docs/glob_match.md @@ -0,0 +1,35 @@ + + +Public API + + + +## glob_match + +
+glob_match(expr, path, match_path_separator)
+
+ +Test if the passed path matches the glob expression. + +`*` A single asterisk stands for zero or more arbitrary characters except for the the path separator `/` if `match_path_separator` is False + +`?` The question mark stands for exactly one character except for the the path separator `/` if `match_path_separator` is False + +`**` A double asterisk stands for an arbitrary sequence of 0 or more characters. It is only allowed when preceded by either the beginning of the string or a slash. Likewise it must be followed by a slash or the end of the pattern. + + +**PARAMETERS** + + +| Name | Description | Default Value | +| :------------- | :------------- | :------------- | +| expr | the glob expression | none | +| path | the path against which to match the glob expression | none | +| match_path_separator | whether or not to match the path separator '/' when matching * and ? expressions | False | + +**RETURNS** + +True if the path matches the glob expression + + diff --git a/lib/BUILD.bazel b/lib/BUILD.bazel index e564fe6..a1cb65e 100644 --- a/lib/BUILD.bazel +++ b/lib/BUILD.bazel @@ -161,6 +161,15 @@ bzl_library( deps = ["//lib/private:yq"], ) +bzl_library( + name = "glob_match", + srcs = ["glob_match.bzl"], + visibility = ["//visibility:public"], + deps = [ + "//lib/private:glob_match", + ], +) + bzl_library( name = "host_repo", srcs = ["host_repo.bzl"], diff --git a/lib/glob_match.bzl b/lib/glob_match.bzl index dfe31ec..2b649dc 100644 --- a/lib/glob_match.bzl +++ b/lib/glob_match.bzl @@ -1,140 +1,5 @@ -""" -Basic glob match implementation for starlark. +"Public API" -This was originally developed by @jbedard for use in rules_js -(https://github.com/aspect-build/rules_js/blob/6ca32d5199ddc0bf19bd704f591030dc1468ca5f/npm/private/pkg_glob.bzl) -to support the pnpm public-hoist-expr option (https://pnpm.io/npmrc#public-hoist-expr). The pnpm -implementation and tests were used as a reference implementation: - https://github.com/pnpm/pnpm/blob/v7.4.0-2/packages/matcher/src/index.ts - https://github.com/pnpm/pnpm/blob/v7.4.0-2/packages/matcher/test/index.ts -""" +load("//lib/private:glob_match.bzl", _glob_match = "glob_match") -def _split_on(expr, splits): - # Splits an expression on the tokens in splits but keeps the tokens split in the result. - # Tokens are matched in order so a token such as `**` should come before `*`. - result = [] - accumulator = "" - skip = 0 - for i in range(len(expr)): - j = i + skip - if j >= len(expr): - break - for split in splits: - if not split: - fail("empty split token") - if expr[j:].startswith(split): - if accumulator: - result.append(accumulator) - accumulator = "" - result.append(split) - skip = skip + len(split) - j = i + skip - break - if j >= len(expr): - break - accumulator = accumulator + expr[j] - if accumulator: - result.append(accumulator) - return result - -def glob_match(expr, path, match_path_separator = False): - """Test if the passed path matches the glob expression. - - `*` A single asterisk stands for zero or more arbitrary characters except for the the path separator `/` if `match_path_separator` is False - - `?` The question mark stands for exactly one character except for the the path separator `/` if `match_path_separator` is False - - `**` A double asterisk stands for an arbitrary sequence of 0 or more characters. It is only allowed when preceded by either the beginning of the string or a slash. Likewise it must be followed by a slash or the end of the pattern. - - Args: - expr: the glob expression - path: the path against which to match the glob expression - match_path_separator: whether or not to match the path separator '/' when matching `*` and `?` expressions - - Returns: - True if the path matches the glob expression - """ - - expr_i = 0 - path_i = 0 - - if expr.find("***") != -1: - fail("glob_match: invalid *** pattern found in glob expression") - - expr_parts = _split_on(expr, ["**", "*", "?"]) - - for i, expr_part in enumerate(expr_parts): - if expr_part == "**": - if i > 0 and not expr_parts[i - 1].endswith("/"): - msg = "glob_match: `**` globstar in expression `{}` must be at the start of the expression or preceeded by `/`".format(expr) - fail(msg) - if i != len(expr_parts) - 1 and not expr_parts[i + 1].startswith("/"): - msg = "glob_match: `**` globstar in expression `{}` must be at the end of the expression or followed by `/`".format(expr) - fail(msg) - - # Locations a * was terminated that can be rolled back to. - branches = [] - - # Loop "forever" (2^30). - for _ in range(1073741824): - subpath = path[path_i:] if path_i < len(path) else None - subexpr = expr_parts[expr_i] if expr_i < len(expr_parts) else None - - # The next part of the expression. - next_pp = expr_parts[expr_i + 1] if expr_i + 1 < len(expr_parts) else None - - stop_at_leading_path_separator = not match_path_separator and subpath != None and subpath.startswith("/") - stop_at_contained_path_separator = not match_path_separator and subpath != None and subpath.find("/") != -1 - - if (subexpr == "*" and subpath != None and not stop_at_leading_path_separator) or (subexpr == "**" and subpath != None): - # A wildcard or globstar in the expression and something to consume. - if next_pp == None and not stop_at_contained_path_separator: - # This wildcard is the last and matches everything beyond here. - return True - - # If the next part of the expression matches the current subpath - # then advance past the wildcard and consume that next expression. - if next_pp != None and subpath.startswith(next_pp): - # Persist the alternative of using the wildcard instead of advancing. - branches.append([expr_i, path_i + 1]) - expr_i = expr_i + 1 - else: - # Otherwise consume the next character. - path_i = path_i + 1 - - elif subexpr == "*" and subpath != None and stop_at_leading_path_separator and next_pp != None and subpath.startswith(next_pp): - # A wildcard that has hit a path separator but we can branch - # Persist the alternative of using the wildcard instead of advancing. - branches.append([expr_i, path_i + 1]) - expr_i = expr_i + 1 - - elif subexpr == "?" and subpath != None and not stop_at_leading_path_separator: - # The string matches a ? wildcard at the current location in the path. - expr_i = expr_i + 1 - path_i = path_i + 1 - - elif subexpr and subpath != None and subpath.startswith(subexpr): - # The string matches the current location in the path. - expr_i = expr_i + 1 - path_i = path_i + len(subexpr) - - elif subpath == None and expr_i == len(expr_parts) - 1 and (subexpr == "*" or subexpr == "**"): - # Reached the package on a final empty "*" or "**" expression - return True - - elif len(branches) > 0: - # The string does not match, backup to the previous branch. - [restored_pattern_i, restored_path_i] = branches.pop() - - path_i = restored_path_i - expr_i = restored_pattern_i - - else: - # The string does not match, with no branches to rollback to, there is no match. - return False - - if path_i == len(path) and expr_i == len(expr_parts): - # Reached the end of the expression and package. - return True - - fail("glob_match: reached the end of the (in)finite loop") +glob_match = _glob_match diff --git a/lib/private/BUILD.bazel b/lib/private/BUILD.bazel index 75e7a03..cc59548 100644 --- a/lib/private/BUILD.bazel +++ b/lib/private/BUILD.bazel @@ -113,6 +113,12 @@ bzl_library( visibility = ["//lib:__subpackages__"], ) +bzl_library( + name = "glob_match", + srcs = ["glob_match.bzl"], + visibility = ["//lib:__subpackages__"], +) + bzl_library( name = "directory_path", srcs = ["directory_path.bzl"], diff --git a/lib/private/glob_match.bzl b/lib/private/glob_match.bzl new file mode 100644 index 0000000..dfe31ec --- /dev/null +++ b/lib/private/glob_match.bzl @@ -0,0 +1,140 @@ +""" +Basic glob match implementation for starlark. + +This was originally developed by @jbedard for use in rules_js +(https://github.com/aspect-build/rules_js/blob/6ca32d5199ddc0bf19bd704f591030dc1468ca5f/npm/private/pkg_glob.bzl) +to support the pnpm public-hoist-expr option (https://pnpm.io/npmrc#public-hoist-expr). The pnpm +implementation and tests were used as a reference implementation: + https://github.com/pnpm/pnpm/blob/v7.4.0-2/packages/matcher/src/index.ts + https://github.com/pnpm/pnpm/blob/v7.4.0-2/packages/matcher/test/index.ts +""" + +def _split_on(expr, splits): + # Splits an expression on the tokens in splits but keeps the tokens split in the result. + # Tokens are matched in order so a token such as `**` should come before `*`. + result = [] + accumulator = "" + skip = 0 + for i in range(len(expr)): + j = i + skip + if j >= len(expr): + break + for split in splits: + if not split: + fail("empty split token") + if expr[j:].startswith(split): + if accumulator: + result.append(accumulator) + accumulator = "" + result.append(split) + skip = skip + len(split) + j = i + skip + break + if j >= len(expr): + break + accumulator = accumulator + expr[j] + if accumulator: + result.append(accumulator) + return result + +def glob_match(expr, path, match_path_separator = False): + """Test if the passed path matches the glob expression. + + `*` A single asterisk stands for zero or more arbitrary characters except for the the path separator `/` if `match_path_separator` is False + + `?` The question mark stands for exactly one character except for the the path separator `/` if `match_path_separator` is False + + `**` A double asterisk stands for an arbitrary sequence of 0 or more characters. It is only allowed when preceded by either the beginning of the string or a slash. Likewise it must be followed by a slash or the end of the pattern. + + Args: + expr: the glob expression + path: the path against which to match the glob expression + match_path_separator: whether or not to match the path separator '/' when matching `*` and `?` expressions + + Returns: + True if the path matches the glob expression + """ + + expr_i = 0 + path_i = 0 + + if expr.find("***") != -1: + fail("glob_match: invalid *** pattern found in glob expression") + + expr_parts = _split_on(expr, ["**", "*", "?"]) + + for i, expr_part in enumerate(expr_parts): + if expr_part == "**": + if i > 0 and not expr_parts[i - 1].endswith("/"): + msg = "glob_match: `**` globstar in expression `{}` must be at the start of the expression or preceeded by `/`".format(expr) + fail(msg) + if i != len(expr_parts) - 1 and not expr_parts[i + 1].startswith("/"): + msg = "glob_match: `**` globstar in expression `{}` must be at the end of the expression or followed by `/`".format(expr) + fail(msg) + + # Locations a * was terminated that can be rolled back to. + branches = [] + + # Loop "forever" (2^30). + for _ in range(1073741824): + subpath = path[path_i:] if path_i < len(path) else None + subexpr = expr_parts[expr_i] if expr_i < len(expr_parts) else None + + # The next part of the expression. + next_pp = expr_parts[expr_i + 1] if expr_i + 1 < len(expr_parts) else None + + stop_at_leading_path_separator = not match_path_separator and subpath != None and subpath.startswith("/") + stop_at_contained_path_separator = not match_path_separator and subpath != None and subpath.find("/") != -1 + + if (subexpr == "*" and subpath != None and not stop_at_leading_path_separator) or (subexpr == "**" and subpath != None): + # A wildcard or globstar in the expression and something to consume. + if next_pp == None and not stop_at_contained_path_separator: + # This wildcard is the last and matches everything beyond here. + return True + + # If the next part of the expression matches the current subpath + # then advance past the wildcard and consume that next expression. + if next_pp != None and subpath.startswith(next_pp): + # Persist the alternative of using the wildcard instead of advancing. + branches.append([expr_i, path_i + 1]) + expr_i = expr_i + 1 + else: + # Otherwise consume the next character. + path_i = path_i + 1 + + elif subexpr == "*" and subpath != None and stop_at_leading_path_separator and next_pp != None and subpath.startswith(next_pp): + # A wildcard that has hit a path separator but we can branch + # Persist the alternative of using the wildcard instead of advancing. + branches.append([expr_i, path_i + 1]) + expr_i = expr_i + 1 + + elif subexpr == "?" and subpath != None and not stop_at_leading_path_separator: + # The string matches a ? wildcard at the current location in the path. + expr_i = expr_i + 1 + path_i = path_i + 1 + + elif subexpr and subpath != None and subpath.startswith(subexpr): + # The string matches the current location in the path. + expr_i = expr_i + 1 + path_i = path_i + len(subexpr) + + elif subpath == None and expr_i == len(expr_parts) - 1 and (subexpr == "*" or subexpr == "**"): + # Reached the package on a final empty "*" or "**" expression + return True + + elif len(branches) > 0: + # The string does not match, backup to the previous branch. + [restored_pattern_i, restored_path_i] = branches.pop() + + path_i = restored_path_i + expr_i = restored_pattern_i + + else: + # The string does not match, with no branches to rollback to, there is no match. + return False + + if path_i == len(path) and expr_i == len(expr_parts): + # Reached the end of the expression and package. + return True + + fail("glob_match: reached the end of the (in)finite loop")