Add rsplit method for str and unicode builtin types.

author Hye-Shik Chang <hyeshik@gmail.com>

Mon, 15 Dec 2003 18:49:53 +0000 (18:49 +0000)

committer Hye-Shik Chang <hyeshik@gmail.com>

Mon, 15 Dec 2003 18:49:53 +0000 (18:49 +0000)
author Hye-Shik Chang <hyeshik@gmail.com>
Mon, 15 Dec 2003 18:49:53 +0000 (18:49 +0000)
committer Hye-Shik Chang <hyeshik@gmail.com>
Mon, 15 Dec 2003 18:49:53 +0000 (18:49 +0000)
diff --git a/Doc/lib/libstdtypes.tex b/Doc/lib/libstdtypes.tex

index 7feda604734d482062dfbb93183005084fc9a87f..4ce6ec5dd3dbc50b5544b04964510d81b675ecdc 100644 (file)
--- a/Doc/lib/libstdtypes.tex
+++ b/Doc/lib/libstdtypes.tex
@@ -694,6 +694,24 @@ The original string is returned if
  \versionchanged[Support for the \var{fillchar} argument]{2.4}
  \end{methoddesc}
  
+\begin{methoddesc}[string]{rsplit}{\optional{, sep\optional{, maxsplit}}}
+Return a list of the words of the string, scanning the string from
+the end working forward.  The resulting list of words is in the
+same order as \function{split()}.  If the optional second argument
+\var{sep} is absent or \code{None}, the words are separated by
+arbitrary strings of whitespace characters (space, tab, newline,
+return, formfeed).  If the second argument \var{sep} is present and
+not \code{None}, it specifies a string to be used as the word
+separator.  The returned list will then have one more item than the
+number of non-overlapping occurrences of the separator in the string.
+The optional third argument \var{maxsplit} defaults to 0.  If it
+is nonzero, at most \var{maxsplit} number of splits occur, and the
+remainder of the string is returned as the first element of the
+list (thus, the list will have at most \code{\var{maxsplit}+1}
+elements).
+\versionadded{2.4}
+\end{methoddesc}
+
  \begin{methoddesc}[string]{rstrip}{\optional{chars}}
  Return a copy of the string with trailing characters removed.  If
  \var{chars} is omitted or \code{None}, whitespace characters are
diff --git a/Doc/lib/libstring.tex b/Doc/lib/libstring.tex

index 3f902cf8403fd13c7ca6101906fcce8337558f19..11054e2614f0d080a5eebc1ad58e320c1b2bc407 100644 (file)
--- a/Doc/lib/libstring.tex
+++ b/Doc/lib/libstring.tex
@@ -215,6 +215,23 @@ The functions defined in this module are:
    elements).
  \end{funcdesc}
  
+\begin{funcdesc}{rsplit}{s\optional{, sep\optional{, maxsplit}}}
+  Return a list of the words of the string \var{s}, scanning \var{s} from
+  the end working forward.  The resulting list of words is in the same
+  order as \function{split()}.  If the optional second argument \var{sep}
+  is absent or \code{None}, the words are separated by arbitrary strings
+  of whitespace characters (space, tab, newline, return, formfeed).
+  If the second argument \var{sep} is present and not \code{None}, it
+  specifies a string to be used as the word separator.  The returned
+  list will then have one more item than the number of non-overlapping
+  occurrences of the separator in the string.  The optional third argument
+  \var{maxsplit} defaults to 0.  If it is nonzero, at most \var{maxsplit}
+  number of splits occur, and the remainder of the string is returned
+  as the first element of the list (thus, the list will have at most
+  \code{\var{maxsplit}+1} elements).
+  \versionadded{2.4}
+\end{funcdesc}
+
  \begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
    This function behaves identically to \function{split()}.  (In the
    past, \function{split()} was only used with one argument, while
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index 24e59de4ce00fe52c77ea3a804869f5a3ae7d1aa..1690f8a9e0d8ff0805a5af7dc878f7a23ad2c635 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -185,6 +185,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
  # define PyUnicode_Resize PyUnicodeUCS2_Resize
  # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
  # define PyUnicode_Split PyUnicodeUCS2_Split
+# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
  # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
  # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
  # define PyUnicode_Translate PyUnicodeUCS2_Translate
@@ -959,6 +960,25 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
      int keepends               /* If true, line end markers are included */
      );         
  
+/* Split a string giving a list of Unicode strings.
+
+   If sep is NULL, splitting will be done at all whitespace
+   substrings. Otherwise, splits occur at the given separator.
+
+   At most maxsplit splits will be done. But unlike PyUnicode_Split
+   PyUnicode_RSplit splits from the end of the string. If negative,
+   no limit is set.
+
+   Separators are not included in the resulting list.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
+    PyObject *s,               /* String to split */
+    PyObject *sep,             /* String separator */
+    int maxsplit               /* Maxsplit count */
+    );         
+
  /* Translate a string by applying a character mapping table to it and
     return the resulting Unicode object.
  
diff --git a/Lib/string.py b/Lib/string.py

index 0a77f461ffda09b4ddf9e0cb2c45414b91f6d225..bc10c20337a57704583d6ae9417ef4baad2a1cb2 100644 (file)
--- a/Lib/string.py
+++ b/Lib/string.py
@@ -121,6 +121,18 @@ def split(s, sep=None, maxsplit=-1):
      return s.split(sep, maxsplit)
  splitfields = split
  
+# Split a string into a list of space/tab-separated words
+def rsplit(s, sep=None, maxsplit=-1):
+    """rsplit(s [,sep [,maxsplit]]) -> list of strings
+
+    Return a list of the words in the string s, using sep as the
+    delimiter string, starting at the end of the string and working
+    to the front.  If maxsplit is given, at most maxsplit splits are
+    done. If sep is not specified or is None, any whitespace string
+    is a separator.
+    """
+    return s.rsplit(sep, maxsplit)
+
  # Join fields with optional separator
  def join(words, sep = ' '):
      """join(list [,sep]) -> string
diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py

index 236c577607cd11080fd71ff73fc9f2afe22182f3..c3e53ad48b9aa5e14902bd8bb60152443c9f8c80 100644 (file)
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@@ -189,6 +189,26 @@ class CommonTest(unittest.TestCase):
  
          self.checkraises(TypeError, 'hello', 'split', 42, 42, 42)
  
+    def test_rsplit(self):
+        self.checkequal(['this', 'is', 'the', 'rsplit', 'function'],
+                         'this is the rsplit function', 'rsplit')
+        self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|')
+        self.checkequal(['a|b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|', 2)
+        self.checkequal(['a b c', 'd'], 'a b c d', 'rsplit', None, 1)
+        self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2)
+        self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 3)
+        self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 4)
+        self.checkequal(['a b c d'], 'a b c d', 'rsplit', None, 0)
+        self.checkequal(['a, b, c', 'd'], 'a, b, c, d', 'rsplit', ', ', 1)
+        self.checkequal(['a, b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 2)
+        self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 3)
+        self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 4)
+        self.checkequal(['a, b, c, d'], 'a, b, c, d', 'rsplit', ', ', 0)
+        self.checkequal(['a  b', 'c', 'd'], 'a  b  c  d', 'rsplit', None, 2)
+        self.checkequal(['a\x00b', 'c'], 'a\x00b\x00c', 'rsplit', '\x00', 1)
+        self.checkequal(['', ''], 'abcd', 'rsplit', 'abcd')
+        self.checkequal([u'a b', u'c', u'd'], 'a b c d', 'rsplit', u' ', 2)
+
      def test_strip(self):
          self.checkequal('hello', '   hello   ', 'strip')
          self.checkequal('hello   ', '   hello   ', 'lstrip')
diff --git a/Objects/stringobject.c b/Objects/stringobject.c

index d3351df448a16fc6b179c512188daa4e8a68608c..951205961755ecf01d92f53446a21ee9f36ac730 100644 (file)
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@@ -1407,6 +1407,129 @@ string_split(PyStringObject *self, PyObject *args)
         return NULL;
  }
  
+static PyObject *
+rsplit_whitespace(const char *s, int len, int maxsplit)
+{
+       int i, j, err;
+       PyObject* item;
+       PyObject *list = PyList_New(0);
+
+       if (list == NULL)
+               return NULL;
+
+       for (i = j = len - 1; i >= 0; ) {
+               while (i >= 0 && isspace(Py_CHARMASK(s[i])))
+                       i--;
+               j = i;
+               while (i >= 0 && !isspace(Py_CHARMASK(s[i])))
+                       i--;
+               if (j > i) {
+                       if (maxsplit-- <= 0)
+                               break;
+                       item = PyString_FromStringAndSize(s+i+1, (int)(j-i));
+                       if (item == NULL)
+                               goto finally;
+                       err = PyList_Insert(list, 0, item);
+                       Py_DECREF(item);
+                       if (err < 0)
+                               goto finally;
+                       while (i >= 0 && isspace(Py_CHARMASK(s[i])))
+                               i--;
+                       j = i;
+               }
+       }
+       if (j >= 0) {
+               item = PyString_FromStringAndSize(s, (int)(j + 1));
+               if (item == NULL)
+                       goto finally;
+               err = PyList_Insert(list, 0, item);
+               Py_DECREF(item);
+               if (err < 0)
+                       goto finally;
+       }
+       return list;
+  finally:
+       Py_DECREF(list);
+       return NULL;
+}
+
+
+PyDoc_STRVAR(rsplit__doc__,
+"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
+\n\
+Return a list of the words in the string S, using sep as the\n\
+delimiter string, starting at the end of the string and working\n\
+to the front.  If maxsplit is given, at most maxsplit splits are\n\
+done. If sep is not specified or is None, any whitespace string\n\
+is a separator.");
+
+static PyObject *
+string_rsplit(PyStringObject *self, PyObject *args)
+{
+       int len = PyString_GET_SIZE(self), n, i, j, err;
+       int maxsplit = -1;
+       const char *s = PyString_AS_STRING(self), *sub;
+       PyObject *list, *item, *subobj = Py_None;
+
+       if (!PyArg_ParseTuple(args, "|Oi:rsplit", &subobj, &maxsplit))
+               return NULL;
+       if (maxsplit < 0)
+               maxsplit = INT_MAX;
+       if (subobj == Py_None)
+               return rsplit_whitespace(s, len, maxsplit);
+       if (PyString_Check(subobj)) {
+               sub = PyString_AS_STRING(subobj);
+               n = PyString_GET_SIZE(subobj);
+       }
+#ifdef Py_USING_UNICODE
+       else if (PyUnicode_Check(subobj))
+               return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
+#endif
+       else if (PyObject_AsCharBuffer(subobj, &sub, &n))
+               return NULL;
+       if (n == 0) {
+               PyErr_SetString(PyExc_ValueError, "empty separator");
+               return NULL;
+       }
+
+       list = PyList_New(0);
+       if (list == NULL)
+               return NULL;
+
+       j = len;
+       i = j - n;
+       while (i >= 0) {
+               if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) {
+                       if (maxsplit-- <= 0)
+                               break;
+                       item = PyString_FromStringAndSize(s+i+n, (int)(j-i-n));
+                       if (item == NULL)
+                               goto fail;
+                       err = PyList_Insert(list, 0, item);
+                       Py_DECREF(item);
+                       if (err < 0)
+                               goto fail;
+                       j = i;
+                       i -= n;
+               }
+               else
+                       i--;
+       }
+       item = PyString_FromStringAndSize(s, j);
+       if (item == NULL)
+               goto fail;
+       err = PyList_Insert(list, 0, item);
+       Py_DECREF(item);
+       if (err < 0)
+               goto fail;
+
+       return list;
+
+ fail:
+       Py_DECREF(list);
+       return NULL;
+}
+
  
  PyDoc_STRVAR(join__doc__,
  "S.join(sequence) -> string\n\
@@ -3064,6 +3187,7 @@ string_methods[] = {
            string.maketrans(). */
         {"join", (PyCFunction)string_join, METH_O, join__doc__},
         {"split", (PyCFunction)string_split, METH_VARARGS, split__doc__},
+       {"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__},
         {"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__},
         {"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__},
         {"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index f87d74990bbaf4f964917031da8848f93d579adb..19d0353e8a8cdbf5d49dd3cc76368d2ebec090c4 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4053,7 +4053,7 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
  }
  
  #define SPLIT_APPEND(data, left, right)                                        \
-       str = PyUnicode_FromUnicode(data + left, right - left);         \
+       str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
         if (!str)                                                       \
             goto onError;                                               \
         if (PyList_Append(list, str)) {                                 \
@@ -4063,6 +4063,17 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
          else                                                           \
              Py_DECREF(str);
  
+#define SPLIT_INSERT(data, left, right)                                        \
+       str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
+       if (!str)                                                       \
+           goto onError;                                               \
+       if (PyList_Insert(list, 0, str)) {                              \
+           Py_DECREF(str);                                             \
+           goto onError;                                               \
+       }                                                               \
+        else                                                           \
+            Py_DECREF(str);
+
  static
  PyObject *split_whitespace(PyUnicodeObject *self,
                            PyObject *list,
@@ -4214,7 +4225,106 @@ PyObject *split_substring(PyUnicodeObject *self,
      return NULL;
  }
  
+static
+PyObject *rsplit_whitespace(PyUnicodeObject *self,
+                           PyObject *list,
+                           int maxcount)
+{
+    register int i;
+    register int j;
+    int len = self->length;
+    PyObject *str;
+
+    for (i = j = len - 1; i >= 0; ) {
+       /* find a token */
+       while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
+           i--;
+       j = i;
+       while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
+           i--;
+       if (j > i) {
+           if (maxcount-- <= 0)
+               break;
+           SPLIT_INSERT(self->str, i + 1, j + 1);
+           while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
+               i--;
+           j = i;
+       }
+    }
+    if (j >= 0) {
+       SPLIT_INSERT(self->str, 0, j + 1);
+    }
+    return list;
+
+ onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
+static 
+PyObject *rsplit_char(PyUnicodeObject *self,
+                     PyObject *list,
+                     Py_UNICODE ch,
+                     int maxcount)
+{
+    register int i;
+    register int j;
+    int len = self->length;
+    PyObject *str;
+
+    for (i = j = len - 1; i >= 0; ) {
+       if (self->str[i] == ch) {
+           if (maxcount-- <= 0)
+               break;
+           SPLIT_INSERT(self->str, i + 1, j + 1);
+           j = i = i - 1;
+       } else
+           i--;
+    }
+    if (j >= 0) {
+       SPLIT_INSERT(self->str, 0, j + 1);
+    }
+    return list;
+
+ onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
+static 
+PyObject *rsplit_substring(PyUnicodeObject *self,
+                          PyObject *list,
+                          PyUnicodeObject *substring,
+                          int maxcount)
+{
+    register int i;
+    register int j;
+    int len = self->length;
+    int sublen = substring->length;
+    PyObject *str;
+
+    for (i = len - sublen, j = len; i >= 0; ) {
+       if (Py_UNICODE_MATCH(self, i, substring)) {
+           if (maxcount-- <= 0)
+               break;
+           SPLIT_INSERT(self->str, i + sublen, j);
+           j = i;
+           i -= sublen;
+       } else
+           i--;
+    }
+    if (j >= 0) {
+       SPLIT_INSERT(self->str, 0, j);
+    }
+    return list;
+
+ onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
  #undef SPLIT_APPEND
+#undef SPLIT_INSERT
  
  static
  PyObject *split(PyUnicodeObject *self,
@@ -4245,6 +4355,35 @@ PyObject *split(PyUnicodeObject *self,
         return split_substring(self,list,substring,maxcount);
  }
  
+static
+PyObject *rsplit(PyUnicodeObject *self,
+                PyUnicodeObject *substring,
+                int maxcount)
+{
+    PyObject *list;
+
+    if (maxcount < 0)
+        maxcount = INT_MAX;
+
+    list = PyList_New(0);
+    if (!list)
+        return NULL;
+
+    if (substring == NULL)
+       return rsplit_whitespace(self,list,maxcount);
+
+    else if (substring->length == 1)
+       return rsplit_char(self,list,substring->str[0],maxcount);
+
+    else if (substring->length == 0) {
+       Py_DECREF(list);
+       PyErr_SetString(PyExc_ValueError, "empty separator");
+       return NULL;
+    }
+    else
+       return rsplit_substring(self,list,substring,maxcount);
+}
+
  static
  PyObject *replace(PyUnicodeObject *self,
                   PyUnicodeObject *str1,
@@ -5675,6 +5814,56 @@ unicode_split(PyUnicodeObject *self, PyObject *args)
         return PyUnicode_Split((PyObject *)self, substring, maxcount);
  }
  
+PyObject *PyUnicode_RSplit(PyObject *s,
+                          PyObject *sep,
+                          int maxsplit)
+{
+    PyObject *result;
+    
+    s = PyUnicode_FromObject(s);
+    if (s == NULL)
+       return NULL;
+    if (sep != NULL) {
+       sep = PyUnicode_FromObject(sep);
+       if (sep == NULL) {
+           Py_DECREF(s);
+           return NULL;
+       }
+    }
+
+    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
+
+    Py_DECREF(s);
+    Py_XDECREF(sep);
+    return result;
+}
+
+PyDoc_STRVAR(rsplit__doc__,
+"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
+\n\
+Return a list of the words in S, using sep as the\n\
+delimiter string, starting at the end of the string and\n\
+working to the front.  If maxsplit is given, at most maxsplit\n\
+splits are done. If sep is not specified, any whitespace string\n\
+is a separator.");
+
+static PyObject*
+unicode_rsplit(PyUnicodeObject *self, PyObject *args)
+{
+    PyObject *substring = Py_None;
+    int maxcount = -1;
+
+    if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
+        return NULL;
+
+    if (substring == Py_None)
+       return rsplit(self, NULL, maxcount);
+    else if (PyUnicode_Check(substring))
+       return rsplit(self, (PyUnicodeObject *)substring, maxcount);
+    else
+       return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
+}
+
  PyDoc_STRVAR(splitlines__doc__,
  "S.splitlines([keepends]]) -> list of strings\n\
  \n\
@@ -5870,6 +6059,7 @@ static PyMethodDef unicode_methods[] = {
      {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
      {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
      {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
+    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
      {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
      {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
      {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
author	Hye-Shik Chang <hyeshik@gmail.com>
	Mon, 15 Dec 2003 18:49:53 +0000 (18:49 +0000)
committer	Hye-Shik Chang <hyeshik@gmail.com>
	Mon, 15 Dec 2003 18:49:53 +0000 (18:49 +0000)
Doc/lib/libstdtypes.tex		patch \| blob \| blame \| history
Doc/lib/libstring.tex		patch \| blob \| blame \| history
Include/unicodeobject.h		patch \| blob \| blame \| history
Lib/string.py		patch \| blob \| blame \| history
Lib/test/string_tests.py		patch \| blob \| blame \| history
Objects/stringobject.c		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history