git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
Moved reader \r and \n processing from the iterator to the state machine -
author Andrew McNamara <andrewm@object-craft.com.au>
Thu, 13 Jan 2005 11:30:54 +0000 (11:30 +0000)
committer Andrew McNamara <andrewm@object-craft.com.au>
Thu, 13 Jan 2005 11:30:54 +0000 (11:30 +0000)
this allows for better handling of newline characters in quoted fields (and
hopefully resolves Bug 967934).

Misc/NEWS
Modules/_csv.c

index 5ad39e29dc8b038b0d7b5a882e498571073a1482..6385157b9c956a04b6a53ec82e855cab8b0952e9 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -48,10 +48,11 @@ Library
     dictates.
   + the parser now removes the escapechar prefix from escaped characters.
   + when quoting=QUOTE_NONNUMERIC, the writer now tests for numeric
-    objects, rather than attempting to cast to float, and using the
-    success of that as the determinator.
+    types, rather than any object that can be represented as a number.
   + when quoting=QUOTE_NONNUMERIC, the reader now casts unquoted fields
     to floats.
+  + reader now allows \r characters to be quoted (previously it only allowed
+    \n to be quoted).
   + writer doublequote handling improved.
   + Dialect classes passed to the module are no longer instantiated by
     the module before being parsed (the former validation scheme required
index 8547d3c03b20c254321c3380f9e205254a44d31a..638079293ca4060d7a8d31c0f665bc7375183d8e 100644 (file)
@@ -48,7 +48,8 @@ static long field_limit = 128 * 1024; /* max parsed field size */
 
 typedef enum {
        START_RECORD, START_FIELD, ESCAPED_CHAR, IN_FIELD, 
-       IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD
+       IN_QUOTED_FIELD, ESCAPE_IN_QUOTED_FIELD, QUOTE_IN_QUOTED_FIELD,
+       EAT_CRNL
 } ParserState;
 
 typedef enum {
@@ -96,7 +97,6 @@ typedef struct {
        char *field;            /* build current field in here */
        int field_size;         /* size of allocated buffer */
        int field_len;          /* length of current field */
-       int had_parse_error;    /* did we have a parse error? */
        int numeric_field;      /* treat field as numeric */
        unsigned long line_num; /* Source-file line number */
 } ReaderObj;
@@ -497,6 +497,9 @@ _call_dialect(PyObject *dialect_inst, PyObject *kwargs)
        return dialect;
 }
 
+/*
+ * READER
+ */
 static int
 parse_save_field(ReaderObj *self)
 {
@@ -543,22 +546,6 @@ parse_grow_buff(ReaderObj *self)
        return 1;
 }
 
-static int
-parse_reset(ReaderObj *self)
-{
-       if (self->fields) {
-               Py_DECREF(self->fields);
-       }
-       self->fields = PyList_New(0);
-       if (self->fields == NULL)
-               return -1;
-       self->field_len = 0;
-       self->state = START_RECORD;
-       self->had_parse_error = 0;
-       self->numeric_field = 0;
-       return 0;
-}
-
 static int
 parse_add_char(ReaderObj *self, char c)
 {
@@ -581,19 +568,23 @@ parse_process_char(ReaderObj *self, char c)
        switch (self->state) {
        case START_RECORD:
                /* start of record */
-               if (c == '\n')
+               if (c == '\0')
                        /* empty line - return [] */
                        break;
+               else if (c == '\n' || c == '\r') {
+                       self->state = EAT_CRNL;
+                       break;
+               }
                /* normal character - handle as START_FIELD */
                self->state = START_FIELD;
                /* fallthru */
        case START_FIELD:
                /* expecting field */
-               if (c == '\n') {
+               if (c == '\n' || c == '\r' || c == '\0') {
                        /* save empty field - return [fields] */
                        if (parse_save_field(self) < 0)
                                return -1;
-                       self->state = START_RECORD;
+                       self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
                }
                else if (c == dialect->quotechar && 
                         dialect->quoting != QUOTE_NONE) {
@@ -623,6 +614,8 @@ parse_process_char(ReaderObj *self, char c)
                break;
 
        case ESCAPED_CHAR:
+               if (c == '\0')
+                       c = '\n';
                if (parse_add_char(self, c) < 0)
                        return -1;
                self->state = IN_FIELD;
@@ -630,11 +623,11 @@ parse_process_char(ReaderObj *self, char c)
 
        case IN_FIELD:
                /* in unquoted field */
-               if (c == '\n') {
+               if (c == '\n' || c == '\r' || c == '\0') {
                        /* end of line - return [fields] */
                        if (parse_save_field(self) < 0)
                                return -1;
-                       self->state = START_RECORD;
+                       self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
                }
                else if (c == dialect->escapechar) {
                        /* possible escaped character */
@@ -655,11 +648,8 @@ parse_process_char(ReaderObj *self, char c)
 
        case IN_QUOTED_FIELD:
                /* in quoted field */
-               if (c == '\n') {
-                       /* end of line - save '\n' in field */
-                       if (parse_add_char(self, '\n') < 0)
-                               return -1;
-               }
+               if (c == '\0')
+                       ;
                else if (c == dialect->escapechar) {
                        /* Possible escape character */
                        self->state = ESCAPE_IN_QUOTED_FIELD;
@@ -683,6 +673,8 @@ parse_process_char(ReaderObj *self, char c)
                break;
 
        case ESCAPE_IN_QUOTED_FIELD:
+               if (c == '\0')
+                       c = '\n';
                if (parse_add_char(self, c) < 0)
                        return -1;
                self->state = IN_QUOTED_FIELD;
@@ -703,11 +695,11 @@ parse_process_char(ReaderObj *self, char c)
                                return -1;
                        self->state = START_FIELD;
                }
-               else if (c == '\n') {
+               else if (c == '\n' || c == '\r' || c == '\0') {
                        /* end of line - return [fields] */
                        if (parse_save_field(self) < 0)
                                return -1;
-                       self->state = START_RECORD;
+                       self->state = (c == '\0' ? START_RECORD : EAT_CRNL);
                }
                else if (!dialect->strict) {
                        if (parse_add_char(self, c) < 0)
@@ -716,7 +708,6 @@ parse_process_char(ReaderObj *self, char c)
                }
                else {
                        /* illegal */
-                       self->had_parse_error = 1;
                        PyErr_Format(error_obj, "'%c' expected after '%c'", 
                                        dialect->delimiter, 
                                         dialect->quotechar);
@@ -724,104 +715,83 @@ parse_process_char(ReaderObj *self, char c)
                }
                break;
 
+       case EAT_CRNL:
+               if (c == '\n' || c == '\r')
+                       ;
+               else if (c == '\0')
+                       self->state = START_RECORD;
+               else {
+                       PyErr_Format(error_obj, "new-line character seen in unquoted field - do you need to open the file in universal-newline mode?");
+                       return -1;
+               }
+               break;
+
        }
        return 0;
 }
 
-/*
- * READER
- */
-#define R_OFF(x) offsetof(ReaderObj, x)
-
-static struct PyMemberDef Reader_memberlist[] = {
-       { "dialect", T_OBJECT, R_OFF(dialect), RO },
-       { "line_num", T_ULONG, R_OFF(line_num), RO },
-       { NULL }
-};
+static int
+parse_reset(ReaderObj *self)
+{
+       Py_XDECREF(self->fields);
+       self->fields = PyList_New(0);
+       if (self->fields == NULL)
+               return -1;
+       self->field_len = 0;
+       self->state = START_RECORD;
+       self->numeric_field = 0;
+       return 0;
+}
 
 static PyObject *
 Reader_iternext(ReaderObj *self)
 {
         PyObject *lineobj;
-        PyObject *fields;
-        char *line;
+        PyObject *fields = NULL;
+        char *line, c;
+       int linelen;
 
+       if (parse_reset(self) < 0)
+               return NULL;
         do {
                 lineobj = PyIter_Next(self->input_iter);
                 if (lineobj == NULL) {
                         /* End of input OR exception */
                         if (!PyErr_Occurred() && self->field_len != 0)
-                                return PyErr_Format(error_obj,
-                                                    "newline inside string");
+                                PyErr_Format(error_obj,
+                                            "newline inside string");
                         return NULL;
                 }
                ++self->line_num;
 
-                if (self->had_parse_error)
-                       if (parse_reset(self) < 0) {
-                               Py_DECREF(lineobj);
-                               return NULL;
-                       }
                 line = PyString_AsString(lineobj);
+               linelen = PyString_Size(lineobj);
 
-                if (line == NULL) {
+                if (line == NULL || linelen < 0) {
                         Py_DECREF(lineobj);
                         return NULL;
                 }
-               if (strlen(line) < (size_t)PyString_GET_SIZE(lineobj)) {
-                       self->had_parse_error = 1;
-                       Py_DECREF(lineobj);
-                       return PyErr_Format(error_obj,
-                                           "string with NUL bytes");
-               }
-
-                /* Process line of text - send '\n' to processing code to
-                represent end of line.  End of line which is not at end of
-                string is an error. */
-                while (*line) {
-                        char c;
-
-                        c = *line++;
-                        if (c == '\r') {
-                                c = *line++;
-                                if (c == '\0')
-                                        /* macintosh end of line */
-                                        break;
-                                if (c == '\n') {
-                                        c = *line++;
-                                        if (c == '\0')
-                                                /* DOS end of line */
-                                                break;
-                                }
-                                self->had_parse_error = 1;
-                                Py_DECREF(lineobj);
-                                return PyErr_Format(error_obj,
-                                                    "newline inside string");
-                        }
-                        if (c == '\n') {
-                                c = *line++;
-                                if (c == '\0')
-                                        /* unix end of line */
-                                        break;
-                                self->had_parse_error = 1;
-                                Py_DECREF(lineobj);
-                                return PyErr_Format(error_obj, 
-                                                    "newline inside string");
-                        }
+                while (linelen--) {
+                       c = *line++;
+                       if (c == '\0') {
+                               Py_DECREF(lineobj);
+                               PyErr_Format(error_obj,
+                                            "line contains NULL byte");
+                               goto err;
+                       }
                        if (parse_process_char(self, c) < 0) {
                                Py_DECREF(lineobj);
-                               return NULL;
+                               goto err;
                        }
                }
-               if (parse_process_char(self, '\n') < 0) {
-                       Py_DECREF(lineobj);
-                       return NULL;
-               }
                 Py_DECREF(lineobj);
+               if (parse_process_char(self, 0) < 0)
+                       goto err;
         } while (self->state != START_RECORD);
 
         fields = self->fields;
-        self->fields = PyList_New(0);
+        self->fields = NULL;
+err:
         return fields;
 }
 
@@ -875,6 +845,14 @@ PyDoc_STRVAR(Reader_Type_doc,
 static struct PyMethodDef Reader_methods[] = {
        { NULL, NULL }
 };
+#define R_OFF(x) offsetof(ReaderObj, x)
+
+static struct PyMemberDef Reader_memberlist[] = {
+       { "dialect", T_OBJECT, R_OFF(dialect), RO },
+       { "line_num", T_ULONG, R_OFF(line_num), RO },
+       { NULL }
+};
+
 
 static PyTypeObject Reader_Type = {
        PyObject_HEAD_INIT(NULL)