Move urlencode/urldecode functions to core lib

2015-08-09 16:28:47 -04:00 · 2015-08-09 16:28:47 -04:00 · 14b4ba83c3
parent 88f42b6132
commit 14b4ba83c3
2 changed files with 135 additions and 32 deletions
--- a/lib/functions.zsh
+++ b/lib/functions.zsh
@ -73,3 +73,137 @@ function env_default() {
    env | grep -q "^$1=" && return 0 
    export "$1=$2"       && return 3
 }
+
+
+# Required for $langinfo
+zmodload zsh/langinfo
+
+# URL-encode a string
+#
+# Encodes a string using RFC 2396 URL-encoding (%-escaped).
+# See: https://www.ietf.org/rfc/rfc2396.txt
+#
+# By default, reserved characters and unreserved "mark" characters are
+# not escaped by this function. This allows the common usage of passing
+# an entire URL in, and encoding just special characters in it, with
+# the expectation that reserved and mark characters are used appropriately.
+# The -r and -m options turn on escaping of the reserved and mark characters,
+# respectively, which allows arbitrary strings to be fully escaped for
+# embedding inside URLs, where reserved characters might be misinterpreted.
+#
+# Prints the encoded string on stdout, followed by a newline.
+# Returns nonzero if encoding failed (i.e. the iconv conversion failed).
+#
+# Usage:
+#  omz_urlencode [-r] [-m] [-P] <string>
+#
+#    -r causes reserved characters (;/?:@&=+$,) to be escaped
+#
+#    -m causes "mark" characters (_.!~*'()-) to be escaped
+#
+#    -P causes spaces to be encoded as '%20' instead of '+'
+function omz_urlencode() {
+  emulate -L zsh
+  # Declare the option array locally so zparseopts does not create a global
+  local -a opts
+  zparseopts -D -E -a opts r m P
+
+  local str=$1
+  local spaces_as_plus
+  if [[ -z $opts[(r)-P] ]]; then spaces_as_plus=1; fi
+
+  # URLs must use UTF-8 encoding; convert str to UTF-8 if required
+  local encoding=$langinfo[CODESET]
+  local -a safe_encodings
+  safe_encodings=(UTF-8 utf8 US-ASCII)
+  if [[ -z ${safe_encodings[(r)$encoding]} ]]; then
+    # print -r -- passes the string through verbatim, even if it starts
+    # with "-" (echo would try to parse e.g. "-n" as an option)
+    str=$(print -r -- "$str" | iconv -f $encoding -t UTF-8)
+    if [[ $? != 0 ]]; then
+      echo "Error converting string from $encoding to UTF-8" >&2
+      return 1
+    fi
+  fi
+
+  # Use LC_ALL=C to process text byte-by-byte
+  local i byte ord LC_ALL=C
+  export LC_ALL
+  local reserved=';/?:@&=+$,'
+  # The apostrophe is spliced in as \' between two single-quoted pieces.
+  # Writing '' inside single quotes only concatenates two strings (unless
+  # RC_QUOTES is set), which would silently drop the apostrophe.
+  local mark='_.!~*'\''()-'
+  local dont_escape="[A-Za-z0-9"
+  if [[ -z $opts[(r)-r] ]]; then
+    dont_escape+=$reserved
+  fi
+  # $mark must be last because of the "-"
+  if [[ -z $opts[(r)-m] ]]; then
+    dont_escape+=$mark
+  fi
+  dont_escape+="]"
+
+  # The result is built by in-process string appends, avoiding subshells
+  # in the loop, for performance (primarily on Windows).
+  local url_str=""
+  for (( i = 1; i <= ${#str}; ++i )); do
+    byte="$str[i]"
+    if [[ "$byte" =~ "$dont_escape" ]]; then
+      url_str+="$byte"
+    elif [[ "$byte" == " " && -n $spaces_as_plus ]]; then
+      url_str+="+"
+    else
+      # Hex value of the byte; left-pad with 0 so that bytes below 0x10
+      # produce a valid two-digit escape (%0A rather than %A)
+      ord=$(( [##16] #byte ))
+      url_str+="%${(l:2::0:)ord}"
+    fi
+  done
+  print -r -- "$url_str"
+}
+
+# URL-decode a string
+#
+# Decodes a RFC 2396 URL-encoded (%-escaped) string.
+# This decodes the '+' and '%' escapes in the input string, and leaves
+# other characters unchanged. Does not enforce that the input is a
+# valid URL-encoded string. This is a convenience to allow callers to
+# pass in a full URL or similar strings and decode them for human
+# presentation.
+#
+# Outputs the decoded string on stdout.
+# Returns nonzero if decoding failed (i.e. the iconv conversion failed).
+#
+# Usage:
+#   omz_urldecode <urlstring>  - prints decoded string followed by a newline
+function omz_urldecode {
+  emulate -L zsh
+  local encoded_url=$1
+
+  # Work bytewise, since URLs escape UTF-8 octets
+  local caller_encoding=$langinfo[CODESET]
+  local LC_ALL=C
+  export LC_ALL
+
+  # Change + back to ' '
+  local tmp=${encoded_url:gs/+/ /}
+  # Double any literal backslashes so they pass through escape
+  # processing unchanged
+  tmp=${tmp:gs/\\/\\\\/}
+  # Handle %-escapes by turning them into `\xXX` escapes
+  tmp=${tmp:gs/%/\\x/}
+  # Expand the escapes with the (g) parameter flag instead of eval: the
+  # input is untrusted, and a literal single quote in it would otherwise
+  # terminate the quoted string and be executed as shell code.
+  local decoded=${(g::)tmp}
+
+  # Now we have a UTF-8 encoded string in the variable. We need to re-encode
+  # it if caller is in a non-UTF-8 locale.
+  local -a safe_encodings
+  safe_encodings=(UTF-8 utf8 US-ASCII)
+  if [[ -z ${safe_encodings[(r)$caller_encoding]} ]]; then
+    # print -r -- emits the string verbatim even if it starts with "-"
+    decoded=$(print -r -- "$decoded" | iconv -f UTF-8 -t $caller_encoding)
+    if [[ $? != 0 ]]; then
+      echo "Error converting string from UTF-8 to $caller_encoding" >&2
+      return 1
+    fi
+  fi
+
+  print -r -- "$decoded"
+}
+
--- a/lib/termsupport.zsh
+++ b/lib/termsupport.zsh
@ -59,44 +59,13 @@ preexec_functions+=(omz_termsupport_preexec)

 if [[ "$TERM_PROGRAM" == "Apple_Terminal" ]] && [[ -z "$INSIDE_EMACS" ]]; then

-  # URL-encodes a string
-  # Outputs the encoded string on stdout
-  # Returns nonzero if encoding failed
-  function _omz_urlencode() {
-    local str=$1
-    local url_str=""
-
-    # URLs must use UTF-8 encoding; convert if required
-    local encoding=${LC_CTYPE/*./}
-    if [[ -n $encoding && $encoding != UTF-8 && $encoding != utf8 ]]; then
-      str=$(echo $str | iconv -f $encoding -t UTF-8)
-      if [[ $? != 0 ]]; then
-        echo "Error converting string from $encoding to UTF-8" >&2
-        return 1
-      fi
-    fi
-
-    # Use LC_CTYPE=C to process text byte-by-byte
-    local i ch hexch LC_CTYPE=C
-    for ((i = 1; i <= ${#str}; ++i)); do
-      ch="$str[i]"
-      if [[ "$ch" =~ [/._~A-Za-z0-9-] ]]; then
-        url_str+="$ch"
-      else
-        hexch=$(printf "%02X" "'$ch")
-        url_str+="%$hexch"
-      fi
-    done
-    echo $url_str
-  }
-
  # Emits the control sequence to notify Terminal.app of the cwd
  function update_terminalapp_cwd() {
    # Identify the directory using a "file:" scheme URL, including
    # the host name to disambiguate local vs. remote paths.

    # Percent-encode the pathname.
-    local URL_PATH=$(_omz_urlencode $PWD)
+    local URL_PATH=$(omz_urlencode -P $PWD)
    [[ $? != 0 ]] && return 1
    local PWD_URL="file://$HOST$URL_PATH"
    # Undocumented Terminal.app-specific control sequence