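This patch changes the behaviour of the UTF-16 codec family. Only the native byte order mode (*byteorder == 0) now checks the start of the input for a BOM, adjusts the byte order accordingly and skips it; the fixed byte order modes no longer interpret BOMs and copy them to the output as-is.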

author Marc-André Lemburg <mal@egenix.com>

Mon, 21 May 2001 20:30:15 +0000 (20:30 +0000)

committer Marc-André Lemburg <mal@egenix.com>

Mon, 21 May 2001 20:30:15 +0000 (20:30 +0000)
author Marc-André Lemburg <mal@egenix.com>
Mon, 21 May 2001 20:30:15 +0000 (20:30 +0000)
committer Marc-André Lemburg <mal@egenix.com>
Mon, 21 May 2001 20:30:15 +0000 (20:30 +0000)
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index 988ea1b39ec8af35b826d40d685c1ad313c6452c..f91a5a0c8c7f35e0b7fbd8bb3b83e437bd89084e 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -459,10 +459,11 @@ extern DL_IMPORT(PyObject*) PyUnicode_EncodeUTF8(
         *byteorder == 0:  native order
         *byteorder == 1:  big endian
  
-   and then switches according to all BOM marks it finds in the input
-   data. BOM marks are not copied into the resulting Unicode string.
-   After completion, *byteorder is set to the current byte order at
-   the end of input data.
+   In native mode, the first two bytes of the stream are checked for a
+   BOM mark. If found, the BOM mark is analysed, the byte order
+   adjusted and the BOM skipped.  In the other modes, no BOM mark
+   interpretation is done. After completion, *byteorder is set to the
+   current byte order at the end of input data.
  
     If byteorder is NULL, the codec starts in native order mode.
  
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 475215c25f209c9b337c1650ea00596064397bea..d55e2a72e206eebad9a9e5b2648c473ac7409cf8 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -1001,31 +1001,39 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
      if (byteorder)
         bo = *byteorder;
  
-    while (q < e) {
-       register Py_UNICODE ch = *q++;
-
-       /* Check for BOM marks (U+FEFF) in the input and adjust
-          current byte order setting accordingly. Swap input
-          bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
-          !) */
+    /* Check for BOM marks (U+FEFF) in the input and adjust current
+       byte order setting accordingly. In native mode, the leading BOM
+       mark is skipped, in all other modes, it is copied to the output
+       stream as-is (giving a ZWNBSP character). */
+    if (bo == 0) {
  #ifdef BYTEORDER_IS_LITTLE_ENDIAN
-       if (ch == 0xFEFF) {
+       if (*q == 0xFEFF) {
+           q++;
             bo = -1;
-           continue;
-       } else if (ch == 0xFFFE) {
+       } else if (*q == 0xFFFE) {
+           q++;
             bo = 1;
-           continue;
         }
-       if (bo == 1)
-           ch = (ch >> 8) | (ch << 8);
  #else    
-       if (ch == 0xFEFF) {
+       if (*q == 0xFEFF) {
+           q++;
             bo = 1;
-           continue;
-       } else if (ch == 0xFFFE) {
+       } else if (*q == 0xFFFE) {
+           q++;
             bo = -1;
-           continue;
         }
+#endif
+    }
+    
+    while (q < e) {
+       register Py_UNICODE ch = *q++;
+
+       /* Swap input bytes if needed. (This assumes
+          sizeof(Py_UNICODE) == 2 !) */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+       if (bo == 1)
+           ch = (ch >> 8) | (ch << 8);
+#else    
         if (bo == -1)
             ch = (ch >> 8) | (ch << 8);
  #endif
author	Marc-André Lemburg <mal@egenix.com>
	Mon, 21 May 2001 20:30:15 +0000 (20:30 +0000)
committer	Marc-André Lemburg <mal@egenix.com>
	Mon, 21 May 2001 20:30:15 +0000 (20:30 +0000)
Include/unicodeobject.h		patch | blob | blame | history
Objects/unicodeobject.c		patch | blob | blame | history