New module regsub contains sub(), gsub() and split() as in nawk.

author Guido van Rossum <guido@python.org>

Sun, 20 Sep 1992 21:41:09 +0000 (21:41 +0000)

committer Guido van Rossum <guido@python.org>

Sun, 20 Sep 1992 21:41:09 +0000 (21:41 +0000)
author Guido van Rossum <guido@python.org>
Sun, 20 Sep 1992 21:41:09 +0000 (21:41 +0000)
committer Guido van Rossum <guido@python.org>
Sun, 20 Sep 1992 21:41:09 +0000 (21:41 +0000)
diff --git a/Lib/os.py b/Lib/os.py

index af0ce8956043d085745803df7f5f41c02725783a..556268fe4e89a7f14be1494f5fc594f8e35150ff 100644 (file)
--- a/Lib/os.py
+++ b/Lib/os.py
@@ -18,6 +18,7 @@
  
  try:
         from posix import *
+       from posix import _exit
         name = 'posix'
         curdir = '.'
         pardir = '..'
diff --git a/Lib/regsub.py b/Lib/regsub.py

new file mode 100644 (file)

index 0000000..7eb175b
--- /dev/null
+++ b/Lib/regsub.py
@@ -0,0 +1,147 @@
+# Regular expression subroutines:
+# sub(pat, repl, str): replace first occurrence of pattern in string
+# gsub(pat, repl, str): replace all occurrences of pattern in string
+# split(str, pat): split string using pattern as delimiter
+
+
+import regex
+
+
+# Replace first occurrence of pattern pat in string str by replacement
+# repl.  If the pattern isn't found, the string is returned unchanged.
+# The replacement may contain references \digit to subpatterns and
+# escaped backslashes.  The pattern may be a string or an already
+# compiled pattern.
+
+def sub(pat, repl, str):
+       prog = compile(pat)
+       if prog.search(str) >= 0:
+               regs = prog.regs
+               a, b = regs[0]
+               str = str[:a] + expand(repl, regs, str) + str[b:]
+       return str
+
+
+# Replace all (non-overlapping) occurrences of pattern pat in string
+# str by replacement repl.  The same rules as for sub() apply.
+# Empty matches for the pattern are replaced only when not adjacent to
+# a previous match, so e.g. gsub('', '-', 'abc') returns '-a-b-c-'.
+
+def gsub(pat, repl, str):
+       prog = compile(pat)
+       new = ''
+       start = 0
+       first = 1
+       while prog.search(str, start) >= 0:
+               regs = prog.regs
+               a, b = regs[0]
+               if a == b == start and not first:
+                       if start >= len(str) or prog.search(str, start+1) < 0:
+                               break
+                       regs = prog.regs
+                       a, b = regs[0]
+               new = new + str[start:a] + expand(repl, regs, str)
+               start = b
+               first = 0
+       new = new + str[start:]
+       return new
+
+
+# Split string str into fields separated by delimiters matching pattern
+# pat.  Only non-empty matches for the pattern are considered, so e.g.
+# split('abc', '') returns ['abc'].
+
+def split(str, pat):
+       prog = compile(pat)
+       res = []
+       start = next = 0
+       while prog.search(str, next) >= 0:
+               regs = prog.regs
+               a, b = regs[0]
+               if a == b:
+                       next = next + 1
+                       if next >= len(str):
+                               break
+               else:
+                       res.append(str[start:a])
+                       start = next = b
+       res.append(str[start:])
+       return res
+
+
+# Internal subroutines:
+# compile(pat): compile a pattern, caching already compiled patterns
+# expand(repl, regs, str): expand \digit escapes in replacement string
+
+
+# Manage a cache of compiled regular expressions.
+# If the pattern is a string a compiled version of it is returned.
+# If the pattern has been used before we return an already compiled
+# version from the cache; otherwise we compile it now and save the
+# compiled version in the cache.
+# Instead of a string, a compiled regular expression can also be
+# passed.
+# WARNING: if the pattern syntax is changed, the cache should be
+# flushed!
+
+cache = {}
+
+def compile(pat):
+       if type(pat) <> type(''):
+               return pat              # Assume it is a compiled regex
+       if cache.has_key(pat):
+               prog = cache[pat]       # Get it from the cache
+       else:
+               prog = cache[pat] = regex.compile(pat)
+       return prog
+
+
+# Expand \digit in the replacement.
+# Each occurrence of \digit is replaced by the substring of str
+# indicated by regs[digit].  To include a literal \ in the
+# replacement, double it; other \ escapes are left unchanged (i.e.
+# the \ and the following character are both copied).
+
+def expand(repl, regs, str):
+       if '\\' not in repl:
+               return repl
+       new = ''
+       i = 0
+       while i < len(repl):
+               c = repl[i]; i = i+1
+               if c <> '\\' or i >= len(repl):
+                       new = new + c
+               else:
+                       c = repl[i]; i = i+1
+                       if '0' <= c <= '9':
+                               a, b = regs[eval(c)]
+                               new = new + str[a:b]
+                       elif c == '\\':
+                               new = new + c
+                       else:
+                               new = new + '\\' + c
+       return new
+
+
+# Test program: reads lines of the form "pat repl str" from stdin.
+# Optional argument (sys.argv[1]) gives the pattern used to split lines.
+
+def test():
+       import sys
+       if sys.argv[1:]:
+               delpat = sys.argv[1]
+       else:
+               delpat = '[ \t\n]+'
+       while 1:
+               if sys.stdin.isatty(): sys.stderr.write('--> ')
+               line = sys.stdin.readline()
+               if not line: break
+               if line[-1] == '\n': line = line[:-1]
+               fields = split(line, delpat)
+               if len(fields) <> 3:
+                       print 'Sorry, not three fields'
+                       print 'split:', `fields`
+                       continue
+               [pat, repl, str] = split(line, delpat)
+               print 'sub :', `sub(pat, repl, str)`
+               print 'gsub:', `gsub(pat, repl, str)`
diff --git a/Lib/string.py b/Lib/string.py

index f358ac4778833052bd725b533adaa37b969925c4..6386a9549dd94f2b47c4f3ebeb21886afcfb1f4f 100644 (file)
--- a/Lib/string.py
+++ b/Lib/string.py
@@ -63,13 +63,12 @@ def split(s):
  
  # Split a list into fields separated by a given string
  # NB: splitfields(s, ' ') is NOT the same as split(s)!
-# splitfields(s, '') is illegal
-splitfields_error = 'string.splitfields called with empty separator'
+# splitfields(s, '') returns [s] (in analogy with split() in nawk)
  def splitfields(s, sep):
         res = []
         nsep = len(sep)
         if nsep == 0:
-               raise splitfields_error
+               return [s]
         ns = len(s)
         i = j = 0
         while j+nsep <= ns:
diff --git a/Lib/stringold.py b/Lib/stringold.py

index f358ac4778833052bd725b533adaa37b969925c4..6386a9549dd94f2b47c4f3ebeb21886afcfb1f4f 100644 (file)
--- a/Lib/stringold.py
+++ b/Lib/stringold.py
@@ -63,13 +63,12 @@ def split(s):
  
  # Split a list into fields separated by a given string
  # NB: splitfields(s, ' ') is NOT the same as split(s)!
-# splitfields(s, '') is illegal
-splitfields_error = 'string.splitfields called with empty separator'
+# splitfields(s, '') returns [s] (in analogy with split() in nawk)
  def splitfields(s, sep):
         res = []
         nsep = len(sep)
         if nsep == 0:
-               raise splitfields_error
+               return [s]
         ns = len(s)
         i = j = 0
         while j+nsep <= ns:
author	Guido van Rossum <guido@python.org>
	Sun, 20 Sep 1992 21:41:09 +0000 (21:41 +0000)
committer	Guido van Rossum <guido@python.org>
	Sun, 20 Sep 1992 21:41:09 +0000 (21:41 +0000)
Lib/os.py		patch \| blob \| blame \| history
Lib/regsub.py	[new file with mode: 0644]	patch \| blob
Lib/string.py		patch \| blob \| blame \| history
Lib/stringold.py		patch \| blob \| blame \| history