- Added "unicode_errors" parameter to String, Unicode, etc.

author Mike Bayer <mike_mp@zzzcomputing.com>

Tue, 23 Feb 2010 19:53:07 +0000 (19:53 +0000)

committer Mike Bayer <mike_mp@zzzcomputing.com>

Tue, 23 Feb 2010 19:53:07 +0000 (19:53 +0000)
author Mike Bayer <mike_mp@zzzcomputing.com>
Tue, 23 Feb 2010 19:53:07 +0000 (19:53 +0000)
committer Mike Bayer <mike_mp@zzzcomputing.com>
Tue, 23 Feb 2010 19:53:07 +0000 (19:53 +0000)
diff --git a/CHANGES b/CHANGES

index 735b10e92e0b1483ae5114905b3e108a6513d75e..99ebc79aeb087bb2a472140ffd2179be28624755 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -99,7 +99,20 @@ CHANGES
    - SchemaType and subclasses Boolean, Enum are now serializable,
      including their ddl listener and other event callables.
      [ticket:1694] [ticket:1698]
-    
+
+  - Added "unicode_error" parameter to String, Unicode, etc.
+    Behaves like the 'errors' keyword argument to
+    the standard library's str.decode() method.   This flag
+    requires that `convert_unicode` is set to `"force"` - otherwise,
+    SQLAlchemy is not guaranteed to handle the task of unicode
+    conversion.   Note that this flag adds significant performance
+    overhead to row-fetching operations for backends that already
+    return unicode objects natively (which most DBAPIs do).  This
+    flag should only be used as an absolute last resort for reading
+    strings from a column with varied or corrupted encodings,
+    which only applies to databases that accept invalid encodings
+    in the first place (e.g. MySQL, *not* PG, SQLite, etc.)
+
    - Added math negation operator support, -x.
    
    - FunctionElement subclasses are now directly executable the
diff --git a/lib/sqlalchemy/processors.py b/lib/sqlalchemy/processors.py

index cb4b72545b68688cdee437c383166956913231a0..2bf9221b7ceaad8453aeb041c7c37d3f53987baa 100644 (file)
--- a/lib/sqlalchemy/processors.py
+++ b/lib/sqlalchemy/processors.py
@@ -32,14 +32,18 @@ try:
                                         str_to_datetime, str_to_time, \
                                         str_to_date
  
-    def to_unicode_processor_factory(encoding):
-        return UnicodeResultProcessor(encoding).process
+    def to_unicode_processor_factory(encoding, error=None):
+        # TODO: this is cumbersome
+        if error is not None:
+            return UnicodeResultProcessor(encoding, error).process
+        else:
+            return UnicodeResultProcessor(encoding).process
  
      def to_decimal_processor_factory(target_class):
          return DecimalResultProcessor(target_class).process
  
  except ImportError:
-    def to_unicode_processor_factory(encoding):
+    def to_unicode_processor_factory(encoding, error=None):
          decoder = codecs.getdecoder(encoding)
  
          def process(value):
@@ -50,7 +54,7 @@ except ImportError:
                  # len part is safe: it is done that way in the normal
                  # 'xx'.decode(encoding) code path.
                  # cfr python-source/Python/codecs.c:PyCodec_Decode
-                return decoder(value)[0]
+                return decoder(value, error)[0]
          return process
  
      def to_decimal_processor_factory(target_class):
diff --git a/lib/sqlalchemy/types.py b/lib/sqlalchemy/types.py

index 356edecd85ca746bf03d4c36365dc53067b0071f..e7a60e9a77d9baa46c8f4aee312a7d4f7e894afb 100644 (file)
--- a/lib/sqlalchemy/types.py
+++ b/lib/sqlalchemy/types.py
@@ -500,7 +500,7 @@ class String(Concatenable, TypeEngine):
  
      __visit_name__ = 'string'
  
-    def __init__(self, length=None, convert_unicode=False, assert_unicode=None):
+    def __init__(self, length=None, convert_unicode=False, assert_unicode=None, unicode_error=None):
          """
          Create a string-holding type.
  
@@ -530,7 +530,8 @@ class String(Concatenable, TypeEngine):
            rationale here is that isinstance() calls are enormously
            expensive at the level of column-fetching.  To
            force the check to occur regardless, set 
-          convert_unicode='force'.
+          convert_unicode='force'.   This will incur significant
+          performance overhead when fetching unicode result columns.
            
            Similarly, if the dialect is known to accept bind parameters
            as unicode objects, no translation from unicode to bytestring
@@ -547,10 +548,28 @@ class String(Concatenable, TypeEngine):
  
            If true, will raise an :exc:`sqlalchemy.exc.InvalidRequestError`.
  
+        :param unicode_error: Optional, a method to use to handle Unicode
+          conversion errors. Behaves like the 'errors' keyword argument to
+          the standard library's str.decode() method.   This flag
+          requires that `convert_unicode` is set to `"force"` - otherwise,
+          SQLAlchemy is not guaranteed to handle the task of unicode
+          conversion.   Note that this flag adds significant performance
+          overhead to row-fetching operations for backends that already
+          return unicode objects natively (which most DBAPIs do).  This
+          flag should only be used as an absolute last resort for reading
+          strings from a column with varied or corrupted encodings,
+          which only applies to databases that accept invalid encodings
+          in the first place (e.g. MySQL, *not* PG, SQLite, etc.)
+
          """
+        if unicode_error is not None and convert_unicode != 'force':
+            raise exc.ArgumentError("convert_unicode must be 'force' "
+                                        "when unicode_error is set.")
          self.length = length
          self.convert_unicode = convert_unicode
          self.assert_unicode = assert_unicode
+        self.unicode_error = unicode_error
+        
          
      def adapt(self, impltype):
          return impltype(
@@ -564,7 +583,7 @@ class String(Concatenable, TypeEngine):
                  assert_unicode = dialect.assert_unicode
              else:
                  assert_unicode = self.assert_unicode
-
+            
              if dialect.supports_unicode_binds and assert_unicode:
                  def process(value):
                      if value is None or isinstance(value, unicode):
@@ -584,7 +603,7 @@ class String(Concatenable, TypeEngine):
                  encoder = codecs.getencoder(dialect.encoding)
                  def process(value):
                      if isinstance(value, unicode):
-                        return encoder(value)[0]
+                        return encoder(value, self.unicode_error)[0]
                      elif assert_unicode and value is not None:
                          if assert_unicode == 'warn':
                              util.warn("Unicode type received non-unicode bind "
@@ -607,9 +626,23 @@ class String(Concatenable, TypeEngine):
                          self.convert_unicode == 'force')
          
          if needs_convert:
-            # note we *assume* that we do not have a unicode object
-            # here, instead of an expensive isinstance() check.
-            return processors.to_unicode_processor_factory(dialect.encoding)
+            to_unicode = processors.to_unicode_processor_factory(
+                                    dialect.encoding, self.unicode_error)
+            
+            if dialect.returns_unicode_strings:
+                # we wouldn't be here unless convert_unicode='force'
+                # was specified.   since the dialect returns unicode in
+                # most cases, check for it first and only decode non-unicode.
+                def process(value):
+                    if isinstance(value, unicode):
+                        return value
+                    else:
+                        return to_unicode(value)
+                return process
+            else:
+                # here, we assume that the object is not unicode,
+                # avoiding expensive isinstance() check.
+                return to_unicode
          else:
              return None
  
diff --git a/test/aaa_profiling/test_zoomark_orm.py b/test/aaa_profiling/test_zoomark_orm.py

index d427794ac82bf21e2de577754a96112eec15e4cc..fc3c1cba564d0b06f4f2512066f08900308043cd 100644 (file)
--- a/test/aaa_profiling/test_zoomark_orm.py
+++ b/test/aaa_profiling/test_zoomark_orm.py
@@ -299,11 +299,11 @@ class ZooMarkTest(TestBase):
      def test_profile_2_insert(self):
          self.test_baseline_2_insert()
  
-    @profiling.function_call_count(6783, {'2.6.4':7194})
+    @profiling.function_call_count(6783, {'2.6':7194})
      def test_profile_3_properties(self):
          self.test_baseline_3_properties()
  
-    @profiling.function_call_count(22510, {'2.6.4':24055})
+    @profiling.function_call_count(22510, {'2.6':24055})
      def test_profile_4_expressions(self):
          self.test_baseline_4_expressions()
  
diff --git a/test/sql/test_types.py b/test/sql/test_types.py

index cd1430f4cfa767e618e6f5dc5f271d5ff293192f..532a89d1d416d5f91e7ac2178dd258e57d95c0f8 100644 (file)
--- a/test/sql/test_types.py
+++ b/test/sql/test_types.py
@@ -54,7 +54,8 @@ class AdaptTest(TestBase):
                      if exp in compiled:
                          break
                  else:
-                    assert False, "%r matches none of %r for dialect %s" % (compiled, expected, dialect.name)
+                    assert False, "%r matches none of %r for dialect %s" % \
+                                            (compiled, expected, dialect.name)
              
  class TypeAffinityTest(TestBase):
      def test_type_affinity(self):
@@ -159,12 +160,14 @@ class UserDefinedTest(TestBase):
          class MyDecoratedType(types.TypeDecorator):
              impl = String
              def bind_processor(self, dialect):
-                impl_processor = super(MyDecoratedType, self).bind_processor(dialect) or (lambda value:value)
+                impl_processor = super(MyDecoratedType, self).bind_processor(dialect)\
+                                        or (lambda value:value)
                  def process(value):
                      return "BIND_IN"+ impl_processor(value)
                  return process
              def result_processor(self, dialect, coltype):
-                impl_processor = super(MyDecoratedType, self).result_processor(dialect, coltype) or (lambda value:value)
+                impl_processor = super(MyDecoratedType, self).result_processor(dialect, coltype)\
+                                        or (lambda value:value)
                  def process(value):
                      return impl_processor(value) + "BIND_OUT"
                  return process
@@ -206,14 +209,16 @@ class UserDefinedTest(TestBase):
              impl = Unicode
  
              def bind_processor(self, dialect):
-                impl_processor = super(MyUnicodeType, self).bind_processor(dialect) or (lambda value:value)
+                impl_processor = super(MyUnicodeType, self).bind_processor(dialect)\
+                                        or (lambda value:value)
  
                  def process(value):
                      return "BIND_IN"+ impl_processor(value)
                  return process
  
              def result_processor(self, dialect, coltype):
-                impl_processor = super(MyUnicodeType, self).result_processor(dialect, coltype) or (lambda value:value)
+                impl_processor = super(MyUnicodeType, self).result_processor(dialect, coltype)\
+                                        or (lambda value:value)
                  def process(value):
                      return impl_processor(value) + "BIND_OUT"
                  return process
@@ -342,7 +347,9 @@ class UnicodeTest(TestBase, AssertsExecutionResults):
          # cx_oracle was producing different behavior for cursor.executemany()
          # vs. cursor.execute()
          
-        unicodedata = u"Alors vous imaginez ma surprise, au lever du jour, quand une drôle de petit voix m’a réveillé. Elle disait: « S’il vous plaît… dessine-moi un mouton! »"
+        unicodedata = u"Alors vous imaginez ma surprise, au lever du jour, quand "\
+                        u"une drôle de petit voix m’a réveillé. "\
+                        u"Elle disait: « S’il vous plaît… dessine-moi un mouton! »"
  
          unicode_table.insert().execute(
                  dict(unicode_varchar=unicodedata,unicode_text=unicodedata),
@@ -358,12 +365,19 @@ class UnicodeTest(TestBase, AssertsExecutionResults):
      def test_union(self):
          """ensure compiler processing works for UNIONs"""
  
-        unicodedata = u"Alors vous imaginez ma surprise, au lever du jour, quand une drôle de petit voix m’a réveillé. Elle disait: « S’il vous plaît… dessine-moi un mouton! »"
+        unicodedata = u"Alors vous imaginez ma surprise, au lever du jour, quand "\
+                        u"une drôle de petit voix m’a réveillé. "\
+                        u"Elle disait: « S’il vous plaît… dessine-moi un mouton! »"
  
          unicode_table.insert().execute(unicode_varchar=unicodedata,unicode_text=unicodedata)
                                         
-        x = union(select([unicode_table.c.unicode_varchar]), select([unicode_table.c.unicode_varchar])).execute().first()
-        self.assert_(isinstance(x['unicode_varchar'], unicode) and x['unicode_varchar'] == unicodedata)
+        x = union(
+                    select([unicode_table.c.unicode_varchar]),
+                    select([unicode_table.c.unicode_varchar])
+                ).execute().first()
+        
+        assert isinstance(x['unicode_varchar'], unicode)
+        eq_(x['unicode_varchar'], unicodedata)
  
      @testing.fails_on('oracle', 'oracle converts empty strings to a blank space')
      def test_blank_strings(self):
@@ -373,7 +387,9 @@ class UnicodeTest(TestBase, AssertsExecutionResults):
      def test_parameters(self):
          """test the dialect convert_unicode parameters."""
  
-        unicodedata = u"Alors vous imaginez ma surprise, au lever du jour, quand une drôle de petit voix m’a réveillé. Elle disait: « S’il vous plaît… dessine-moi un mouton! »"
+        unicodedata = u"Alors vous imaginez ma surprise, au lever du jour, quand "\
+                        u"une drôle de petit voix m’a réveillé. "\
+                        u"Elle disait: « S’il vous plaît… dessine-moi un mouton! »"
  
          u = Unicode(assert_unicode=True)
          uni = u.dialect_impl(testing.db.dialect).bind_processor(testing.db.dialect)
@@ -404,7 +420,88 @@ class UnicodeTest(TestBase, AssertsExecutionResults):
          assert isinstance(uni(unicodedata), str)
          # end Py2K
          
-        assert uni(unicodedata) == unicodedata.encode('utf-8')
+        eq_(uni(unicodedata), unicodedata.encode('utf-8'))
+
+    def test_ignoring_unicode_error(self):
+        """checks String(unicode_error='ignore') is passed to underlying codec."""
+        
+        unicodedata = u"Alors vous imaginez ma surprise, au lever du jour, quand "\
+                        u"une drôle de petit voix m’a réveillé. "\
+                        u"Elle disait: « S’il vous plaît… dessine-moi un mouton! »"
+        
+        asciidata = unicodedata.encode('ascii', 'ignore')
+        
+        m = MetaData()
+        table = Table('unicode_err_table', m,
+            Column('sort', Integer),
+            Column('plain_varchar_no_coding_error', \
+                    String(248, convert_unicode='force', unicode_error='ignore'))
+            )
+        
+        m2 = MetaData()
+        utf8_table = Table('unicode_err_table', m2,
+            Column('sort', Integer),
+            Column('plain_varchar_no_coding_error', \
+                    String(248, convert_unicode=True))
+            )
+        
+        engine = engines.testing_engine(options={'encoding':'ascii'})
+        m.create_all(engine)
+        try:
+            # insert a row that should be ascii and 
+            # coerce from unicode with ignore on the bind side
+            engine.execute(
+                table.insert(),
+                sort=1,
+                plain_varchar_no_coding_error=unicodedata
+            )
+
+            # switch to utf-8
+            engine.dialect.encoding = 'utf-8'
+            from binascii import hexlify
+            
+            # the row we inserted was stored as ascii; compare it hexlified
+            row = engine.execute(utf8_table.select()).first()
+            x = row['plain_varchar_no_coding_error']
+            a = hexlify(x)
+            b = hexlify(asciidata)
+            eq_(a, b)
+            
+            # insert another row which will be stored with
+            # utf-8 only chars
+            engine.execute(
+                utf8_table.insert(),
+                sort=2,
+                plain_varchar_no_coding_error=unicodedata
+            )
+
+            # switch back to ascii
+            engine.dialect.encoding = 'ascii'
+
+            # one row will be ascii with ignores,
+            # the other will be either ascii with the ignores
+            # or just the straight unicode+ utf8 value if the 
+            # dialect just returns unicode
+            result = engine.execute(table.select().order_by(table.c.sort))
+            ascii_row = result.fetchone()
+            utf8_row = result.fetchone()
+            result.close()
+            
+            x = ascii_row['plain_varchar_no_coding_error']
+            a = hexlify(x)
+            b = hexlify(asciidata)
+            eq_(a, b)
+
+            x = utf8_row['plain_varchar_no_coding_error']
+            if engine.dialect.returns_unicode_strings:
+                eq_(x, unicodedata)
+            else:
+                a = hexlify(x)
+                eq_(a, b)
+                
+        finally:
+            m.drop_all(engine)
+
  
  class EnumTest(TestBase):
      @classmethod
@@ -477,7 +574,8 @@ class EnumTest(TestBase):
              {'id':4, 'someenum':'four'}
          )
  
-    @testing.fails_on('mysql', "the CHECK constraint doesn't raise an exception for unknown reason")
+    @testing.fails_on('mysql', 
+                    "the CHECK constraint doesn't raise an exception for unknown reason")
      def test_non_native_constraint(self):
          assert_raises(exc.DBAPIError, 
              non_native_enum_table.insert().execute,
@@ -507,13 +605,10 @@ class BinaryTest(TestBase, AssertsExecutionResults):
                  return value
  
          binary_table = Table('binary_table', MetaData(testing.db),
-            Column('primary_id', Integer, Sequence('binary_id_seq', optional=True), primary_key=True),
+            Column('primary_id', Integer, primary_key=True, test_needs_autoincrement=True),
              Column('data', LargeBinary),
              Column('data_slice', LargeBinary(100)),
              Column('misc', String(30)),
-            # construct PickleType with non-native pickle module, since cPickle uses relative module
-            # loading and confuses this test's parent package 'sql' with the 'sqlalchemy.sql' package relative
-            # to the 'types' module
              Column('pickled', PickleType),
              Column('mypickle', MyPickleType)
          )
@@ -558,7 +653,9 @@ class BinaryTest(TestBase, AssertsExecutionResults):
              binary_table.select(order_by=binary_table.c.primary_id),
              text(
                  "select * from binary_table order by binary_table.primary_id", 
-                typemap={'pickled':PickleType, 'mypickle':MyPickleType, 'data':LargeBinary, 'data_slice':LargeBinary}, 
+                typemap={'pickled':PickleType, 
+                        'mypickle':MyPickleType, 
+                        'data':LargeBinary, 'data_slice':LargeBinary}, 
                  bind=testing.db)
          ):
              l = stmt.execute().fetchall()
@@ -602,7 +699,11 @@ class ExpressionTest(TestBase, AssertsExecutionResults):
  
          meta.create_all()
  
-        test_table.insert().execute({'id':1, 'data':'somedata', 'atimestamp':datetime.date(2007, 10, 15), 'avalue':25})
+        test_table.insert().execute({
+                                        'id':1, 
+                                        'data':'somedata', 
+                                        'atimestamp':datetime.date(2007, 10, 15), 
+                                        'avalue':25})
  
      @classmethod
      def teardown_class(cls):
@@ -620,11 +721,20 @@ class ExpressionTest(TestBase, AssertsExecutionResults):
          expr = test_table.c.atimestamp == bindparam("thedate")
          assert expr.right.type.__class__ == test_table.c.atimestamp.type.__class__
  
-        assert testing.db.execute(test_table.select().where(expr), {"thedate":datetime.date(2007, 10, 15)}).fetchall() == [(1, 'somedata', datetime.date(2007, 10, 15), 25)]
+        eq_(
+            testing.db.execute(
+                            test_table.select().where(expr), 
+                            {"thedate":datetime.date(2007, 10, 15)}).fetchall(),
+            [(1, 'somedata', datetime.date(2007, 10, 15), 25)]
+        )
  
          expr = test_table.c.avalue == bindparam("somevalue")
-        assert expr.right.type.__class__ == test_table.c.avalue.type.__class__
-        assert testing.db.execute(test_table.select().where(expr), {"somevalue":25}).fetchall() == [(1, 'somedata', datetime.date(2007, 10, 15), 25)]
+        eq_(expr.right.type.__class__, test_table.c.avalue.type.__class__)
+        
+        eq_(
+            testing.db.execute(test_table.select().where(expr), {"somevalue":25}).fetchall(),
+            [(1, 'somedata', datetime.date(2007, 10, 15), 25)]
+        )
  
      @testing.fails_on('firebird', 'Data type unknown on the parameter')
      def test_operator_adapt(self):
@@ -1013,7 +1123,6 @@ class BooleanTest(TestBase, AssertsExecutionResults):
          testing.db.execute(
              "insert into booltest (id, unconstrained_value) values (1, 5)")
      
-        
  class PickleTest(TestBase):
      def test_eq_comparison(self):
          p1 = PickleType()
@@ -1025,7 +1134,10 @@ class PickleTest(TestBase):
          ):
              assert p1.compare_values(p1.copy_value(obj), obj)
  
-        assert_raises(NotImplementedError, p1.compare_values, pickleable.BrokenComparable('foo'),pickleable.BrokenComparable('foo'))
+        assert_raises(NotImplementedError, 
+                        p1.compare_values,
+                        pickleable.BrokenComparable('foo'),
+                        pickleable.BrokenComparable('foo'))
          
      def test_nonmutable_comparison(self):
          p1 = PickleType()
author	Mike Bayer <mike_mp@zzzcomputing.com>
	Tue, 23 Feb 2010 19:53:07 +0000 (19:53 +0000)
committer	Mike Bayer <mike_mp@zzzcomputing.com>
	Tue, 23 Feb 2010 19:53:07 +0000 (19:53 +0000)
CHANGES		patch \| blob \| blame \| history
lib/sqlalchemy/processors.py		patch \| blob \| blame \| history
lib/sqlalchemy/types.py		patch \| blob \| blame \| history
test/aaa_profiling/test_zoomark_orm.py		patch \| blob \| blame \| history
test/sql/test_types.py		patch \| blob \| blame \| history