- rework Oracle to no longer do its own unicode conversion; this has been observed

author Mike Bayer <mike_mp@zzzcomputing.com>

Fri, 17 Jan 2014 22:36:43 +0000 (17:36 -0500)

committer Mike Bayer <mike_mp@zzzcomputing.com>

Fri, 17 Jan 2014 22:36:43 +0000 (17:36 -0500)
author Mike Bayer <mike_mp@zzzcomputing.com>
Fri, 17 Jan 2014 22:36:43 +0000 (17:36 -0500)
committer Mike Bayer <mike_mp@zzzcomputing.com>
Fri, 17 Jan 2014 22:36:43 +0000 (17:36 -0500)
diff --git a/doc/build/changelog/changelog_09.rst b/doc/build/changelog/changelog_09.rst

index cfb4a5b24342753ddf8cc2c8e0c3ddb99a871138..369eb6c42ff83f496ba2950cebf11c055c23b39b 100644 (file)
--- a/doc/build/changelog/changelog_09.rst
+++ b/doc/build/changelog/changelog_09.rst
@@ -14,6 +14,43 @@
  .. changelog::
      :version: 0.9.2
  
+    .. change::
+        :tags: bug, oracle
+        :tickets: 2911
+
+        It's been observed that the usage of a cx_Oracle "outputtypehandler"
+        in Python 2.x in order to coerce string values to Unicode is inordinately
+        expensive; even though cx_Oracle is written in C, when you pass the
+        Python ``unicode`` primitive to cursor.var() and associate with an output
+        handler, the library counts every conversion as a Python function call
+        with all the requisite overhead being recorded; this is *despite* the fact
+        that when running in Python 3, all strings are also unconditionally coerced
+        to unicode but this does *not* incur the same overhead,
+        meaning that cx_Oracle is failing to use performant techniques in Py2K.
+        As SQLAlchemy cannot easily select for this style of type handler on a
+        per-column basis, the handler was assembled unconditionally thereby
+        adding the overhead to all string access.
+
+        So this logic has been replaced with SQLAlchemy's own unicode
+        conversion system, which now
+        only takes effect in Py2K for columns that are requested as unicode.
+        When C extensions are used, SQLAlchemy's system appears to be 2-3x faster than
+        cx_Oracle's.  Additionally, SQLAlchemy's unicode conversion has been
+        enhanced such that when the "conditional" converter is required
+        (now needed for the Oracle backend), the check for "already unicode" is now
+        performed in C and no longer introduces significant overhead.
+
+        This change has two impacts on the cx_Oracle backend.  One is that
+        string values in Py2K which aren't specifically requested with the
+        Unicode type or convert_unicode=True will now come back as ``str``,
+        not ``unicode`` - this behavior is similar to a backend such as
+        MySQL.  Additionally, when unicode values are requested with the cx_Oracle
+        backend, if the C extensions are *not* used, there is now an additional
+        overhead of an isinstance() check per column.  This tradeoff has been
+        made because the overhead can be worked around by using the C extensions,
+        and the change no longer places a performance burden on the likely majority
+        of Oracle result columns that are non-unicode strings.
+
      .. change::
          :tags: bug, orm
          :tickets: 2908
diff --git a/lib/sqlalchemy/cextension/processors.c b/lib/sqlalchemy/cextension/processors.c

index c1e68fe0f0b6415dc0b075014cda7a10e482e8ef..d568177634daaec5525a0a141113a480802ab89b 100644 (file)
--- a/lib/sqlalchemy/cextension/processors.c
+++ b/lib/sqlalchemy/cextension/processors.c
@@ -409,6 +409,45 @@ UnicodeResultProcessor_process(UnicodeResultProcessor *self, PyObject *value)
      return PyUnicode_Decode(str, len, encoding, errors);
  }
  
+static PyObject *
+UnicodeResultProcessor_conditional_process(UnicodeResultProcessor *self, PyObject *value)
+{
+    const char *encoding, *errors;
+    char *str;
+    Py_ssize_t len;
+
+    if (value == Py_None)
+        Py_RETURN_NONE;
+
+#if PY_MAJOR_VERSION >= 3
+    if (PyUnicode_Check(value) == 1) {
+        Py_INCREF(value);
+        return value;
+    }
+
+    if (PyBytes_AsStringAndSize(value, &str, &len))
+        return NULL;
+
+    encoding = PyBytes_AS_STRING(self->encoding);
+    errors = PyBytes_AS_STRING(self->errors);
+#else
+
+    if (PyUnicode_Check(value) == 1) {
+        Py_INCREF(value);
+        return value;
+    }
+
+    if (PyString_AsStringAndSize(value, &str, &len))
+        return NULL;
+
+
+    encoding = PyString_AS_STRING(self->encoding);
+    errors = PyString_AS_STRING(self->errors);
+#endif
+
+    return PyUnicode_Decode(str, len, encoding, errors);
+}
+
  static void
  UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
  {
@@ -424,6 +463,8 @@ UnicodeResultProcessor_dealloc(UnicodeResultProcessor *self)
  static PyMethodDef UnicodeResultProcessor_methods[] = {
      {"process", (PyCFunction)UnicodeResultProcessor_process, METH_O,
       "The value processor itself."},
+    {"conditional_process", (PyCFunction)UnicodeResultProcessor_conditional_process, METH_O,
+     "Conditional version of the value processor."},
      {NULL}  /* Sentinel */
  };
  
diff --git a/lib/sqlalchemy/dialects/oracle/cx_oracle.py b/lib/sqlalchemy/dialects/oracle/cx_oracle.py

index c427e4bcae2a78cf875a7ef0e7bb123dd5764b51..599eb21a39373964c6c66f30fd90d077d3686f48 100644 (file)
--- a/lib/sqlalchemy/dialects/oracle/cx_oracle.py
+++ b/lib/sqlalchemy/dialects/oracle/cx_oracle.py
@@ -748,9 +748,6 @@ class OracleDialect_cx_oracle(OracleDialect):
                              255,
                              outconverter=self._detect_decimal,
                              arraysize=cursor.arraysize)
-            # allow all strings to come back natively as Unicode
-            elif defaultType in (cx_Oracle.STRING, cx_Oracle.FIXED_CHAR):
-                return cursor.var(util.text_type, size, cursor.arraysize)
  
          def on_connect(conn):
              conn.outputtypehandler = output_type_handler
diff --git a/lib/sqlalchemy/processors.py b/lib/sqlalchemy/processors.py

index 0abf063b3e0b26eae55b86473792cb1743bf8b4c..d0f52e42b8e345609b42886e8330a5f938d404f9 100644 (file)
--- a/lib/sqlalchemy/processors.py
+++ b/lib/sqlalchemy/processors.py
@@ -15,6 +15,7 @@ They all share one common characteristic: None is passed through unchanged.
  import codecs
  import re
  import datetime
+from . import util
  
  
  def str_to_datetime_processor_factory(regexp, type_):
@@ -66,6 +67,21 @@ def py_fallback():
                  return decoder(value, errors)[0]
          return process
  
+    def to_conditional_unicode_processor_factory(encoding, errors=None):
+        decoder = codecs.getdecoder(encoding)
+
+        def process(value):
+            if value is None:
+                return None
+            elif isinstance(value, util.text_type):
+                return value
+            else:
+                # decoder returns a tuple: (value, len). Simply dropping the
+                # len part is safe: it is done that way in the normal
+                # 'xx'.decode(encoding) code path.
+                return decoder(value, errors)[0]
+        return process
+
      def to_decimal_processor_factory(target_class, scale):
          fstring = "%%.%df" % scale
  
@@ -113,12 +129,17 @@ try:
                                         str_to_date
  
      def to_unicode_processor_factory(encoding, errors=None):
-        # this is cumbersome but it would be even more so on the C side
          if errors is not None:
              return UnicodeResultProcessor(encoding, errors).process
          else:
              return UnicodeResultProcessor(encoding).process
  
+    def to_conditional_unicode_processor_factory(encoding, errors=None):
+        if errors is not None:
+            return UnicodeResultProcessor(encoding, errors).conditional_process
+        else:
+            return UnicodeResultProcessor(encoding).conditional_process
+
      def to_decimal_processor_factory(target_class, scale):
          # Note that the scale argument is not taken into account for integer
          # values in the C implementation while it is in the Python one.
diff --git a/lib/sqlalchemy/sql/sqltypes.py b/lib/sqlalchemy/sql/sqltypes.py

index 702e7736034c6ec2ca8f575e784f4d42aa92ab6e..0cc90f26b882bfbf7df7d8cacfeb2f5c6161e1db 100644 (file)
--- a/lib/sqlalchemy/sql/sqltypes.py
+++ b/lib/sqlalchemy/sql/sqltypes.py
@@ -204,20 +204,11 @@ class String(Concatenable, TypeEngine):
                                      dialect.encoding, self.unicode_error)
  
              if needs_isinstance:
-                # we wouldn't be here unless convert_unicode='force'
-                # was specified, or the driver has erratic unicode-returning
-                # habits.  since we will be getting back unicode
-                # in most cases, we check for it (decode will fail).
-                def process(value):
-                    if isinstance(value, util.text_type):
-                        return value
-                    else:
-                        return to_unicode(value)
-                return process
+                return processors.to_conditional_unicode_processor_factory(
+                                    dialect.encoding, self.unicode_error)
              else:
-                # here, we assume that the object is not unicode,
-                # avoiding expensive isinstance() check.
-                return to_unicode
+                return processors.to_unicode_processor_factory(
+                                    dialect.encoding, self.unicode_error)
          else:
              return None
author	Mike Bayer <mike_mp@zzzcomputing.com>
	Fri, 17 Jan 2014 22:36:43 +0000 (17:36 -0500)
committer	Mike Bayer <mike_mp@zzzcomputing.com>
	Fri, 17 Jan 2014 22:36:43 +0000 (17:36 -0500)
doc/build/changelog/changelog_09.rst		patch \| blob \| blame \| history
lib/sqlalchemy/cextension/processors.c		patch \| blob \| blame \| history
lib/sqlalchemy/dialects/oracle/cx_oracle.py		patch \| blob \| blame \| history
lib/sqlalchemy/processors.py		patch \| blob \| blame \| history
lib/sqlalchemy/sql/sqltypes.py		patch \| blob \| blame \| history