From: suraj Date: Mon, 11 Aug 2025 12:21:46 +0000 (-0400) Subject: Fixes: #12711 Added sparse vector support in Oracle X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d8e6654427b5d24474e4d1825208991e5fcb5b43;p=thirdparty%2Fsqlalchemy%2Fsqlalchemy.git Fixes: #12711 Added sparse vector support in Oracle Extended :class:`_oracle.VECTOR` to support sparse vectors. This update introduces :class:`_oracle.VectorStorageType` to specify sparse or dense storage and adds :class:`_oracle.SparseVector`. Pull request courtesy Suraj Shaw. Fixes: #12711 Closes: #12712 Pull-request: https://github.com/sqlalchemy/sqlalchemy/pull/12712 Pull-request-sha: 5a4199de1e89785129ee6fce4c7e65570419a1c7 Change-Id: Icdda9520a5f752e923f087edb166b4032f5bfd21 --- diff --git a/doc/build/changelog/unreleased_20/12711.rst b/doc/build/changelog/unreleased_20/12711.rst new file mode 100644 index 0000000000..f39a3b1ed1 --- /dev/null +++ b/doc/build/changelog/unreleased_20/12711.rst @@ -0,0 +1,8 @@ +.. change:: + :tags: usecase, oracle + :tickets: 12711 + + Extended :class:`_oracle.VECTOR` to support sparse vectors. This update + introduces :class:`_oracle.VectorStorageType` to specify sparse or dense + storage and adds :class:`_oracle.SparseVector`. Pull request courtesy + Suraj Shaw. diff --git a/doc/build/dialects/oracle.rst b/doc/build/dialects/oracle.rst index b9e9a1d087..fc19a81fa4 100644 --- a/doc/build/dialects/oracle.rst +++ b/doc/build/dialects/oracle.rst @@ -94,6 +94,12 @@ construction arguments, are as follows: .. autoclass:: VectorDistanceType :members: +.. autoclass:: VectorStorageType + :members: + +.. autoclass:: SparseVector + :members: + ..
_oracledb: diff --git a/lib/sqlalchemy/dialects/oracle/__init__.py b/lib/sqlalchemy/dialects/oracle/__init__.py index 2265de033c..566edf1c3b 100644 --- a/lib/sqlalchemy/dialects/oracle/__init__.py +++ b/lib/sqlalchemy/dialects/oracle/__init__.py @@ -35,8 +35,10 @@ from .base import VARCHAR2 from .base import VECTOR from .base import VectorIndexConfig from .base import VectorIndexType +from .vector import SparseVector from .vector import VectorDistanceType from .vector import VectorStorageFormat +from .vector import VectorStorageType # Alias oracledb also as oracledb_async oracledb_async = type( @@ -74,4 +76,6 @@ __all__ = ( "VectorIndexType", "VectorIndexConfig", "VectorStorageFormat", + "VectorStorageType", + "SparseVector", ) diff --git a/lib/sqlalchemy/dialects/oracle/base.py b/lib/sqlalchemy/dialects/oracle/base.py index f24f4f54b0..83f562eba5 100644 --- a/lib/sqlalchemy/dialects/oracle/base.py +++ b/lib/sqlalchemy/dialects/oracle/base.py @@ -737,8 +737,22 @@ VECTOR Datatype Oracle Database 23ai introduced a new VECTOR datatype for artificial intelligence and machine learning search operations. The VECTOR datatype is a homogeneous array -of 8-bit signed integers, 8-bit unsigned integers (binary), 32-bit floating-point numbers, -or 64-bit floating-point numbers. +of 8-bit signed integers, 8-bit unsigned integers (binary), 32-bit floating-point +numbers, or 64-bit floating-point numbers. + +A vector's storage type can be either DENSE or SPARSE. A dense vector contains +meaningful values in most or all of its dimensions. In contrast, a sparse vector +has non-zero values in only a few dimensions, with the majority being zero. + +Sparse vectors are represented by the total number of vector dimensions, an array +of indices, and an array of values where each value’s location in the vector is +indicated by the corresponding indices array position. All other vector values are +treated as zero. 
+ +The storage formats that can be used with sparse vectors are float32, float64, and +int8. Note that the binary storage format cannot be used with sparse vectors. + +Sparse vectors are supported when you are using Oracle Database 23.7 or later. .. seealso:: @@ -746,17 +760,26 @@ or 64-bit floating-point numbers. `_ - in the documentation for the :ref:`oracledb` driver. -.. versionadded:: 2.0.41 +.. versionadded:: 2.0.41 - Added VECTOR datatype + +.. versionadded:: 2.0.43 - Added DENSE/SPARSE support CREATE TABLE support for VECTOR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -With the :class:`.VECTOR` datatype, you can specify the dimension for the data -and the storage format. Valid values for storage format are enum values from -:class:`.VectorStorageFormat`. To create a table that includes a -:class:`.VECTOR` column:: +With the :class:`.VECTOR` datatype, you can specify the number of dimensions, +the storage format, and the storage type for the data. Valid values for the +storage format are enum members of :class:`.VectorStorageFormat`. Valid values +for the storage type are enum members of :class:`.VectorStorageType`. If +storage type is not specified, a DENSE vector is created by default. + +To create a table that includes a :class:`.VECTOR` column:: - from sqlalchemy.dialects.oracle import VECTOR, VectorStorageFormat + from sqlalchemy.dialects.oracle import ( + VECTOR, + VectorStorageFormat, + VectorStorageType, + ) t = Table( "t1", @@ -764,7 +787,11 @@ and the storage format. Valid values for storage format are enum values from Column("id", Integer, primary_key=True), Column( "embedding", - VECTOR(dim=3, storage_format=VectorStorageFormat.FLOAT32), + VECTOR( + dim=3, + storage_format=VectorStorageFormat.FLOAT32, + storage_type=VectorStorageType.SPARSE, + ), ), Column(...), ..., @@ -772,31 +799,40 @@ and the storage format. Valid values for storage format are enum values from Vectors can also be defined with an arbitrary number of dimensions and formats. 
This allows you to specify vectors of different dimensions with the various -storage formats mentioned above. +storage formats mentioned below. **Examples** -* In this case, the storage format is flexible, allowing any vector type data to be inserted, - such as INT8 or BINARY etc:: +* In this case, the storage format is flexible, allowing any vector type data to be + inserted, such as INT8 or BINARY etc:: vector_col: Mapped[array.array] = mapped_column(VECTOR(dim=3)) -* The dimension is flexible in this case, meaning that any dimension vector can be used:: +* The dimension is flexible in this case, meaning that any dimension vector can + be used:: vector_col: Mapped[array.array] = mapped_column( VECTOR(storage_format=VectorStorageFormat.INT8) ) -* Both the dimensions and the storage format are flexible:: +* Both the dimensions and the storage format are flexible. It creates a DENSE vector:: vector_col: Mapped[array.array] = mapped_column(VECTOR) +* To create a SPARSE vector with both dimensions and the storage format as flexible, + use the :attr:`.VectorStorageType.SPARSE` storage type:: + + vector_col: Mapped[array.array] = mapped_column( + VECTOR(storage_type=VectorStorageType.SPARSE) + ) + Python Datatypes for VECTOR ~~~~~~~~~~~~~~~~~~~~~~~~~~~ VECTOR data can be inserted using Python list or Python ``array.array()`` objects.
-Python arrays of type FLOAT (32-bit), DOUBLE (64-bit), or INT (8-bit signed integer) -are used as bind values when inserting VECTOR columns:: +Python arrays of type FLOAT (32-bit), DOUBLE (64-bit), INT (8-bit signed integers), +or BINARY (8-bit unsigned integers) are used as bind values when inserting +VECTOR columns:: from sqlalchemy import insert, select @@ -806,6 +842,21 @@ are used as bind values when inserting VECTOR columns:: {"id": 1, "embedding": [1, 2, 3]}, ) +Data can be inserted into a sparse vector using the :class:`_oracle.SparseVector` +class, creating an object consisting of the number of dimensions, an array of indices, and a +corresponding array of values:: + + from sqlalchemy import insert, select + from sqlalchemy.dialects.oracle import SparseVector + + sparse_val = SparseVector(10, [1, 2], array.array("d", [23.45, 221.22])) + + with engine.begin() as conn: + conn.execute( + insert(t1), + {"id": 1, "embedding": sparse_val}, + ) + VECTOR Indexes ~~~~~~~~~~~~~~ @@ -813,6 +864,8 @@ The VECTOR feature supports an Oracle-specific parameter ``oracle_vector`` on the :class:`.Index` construct, which allows the construction of VECTOR indexes. +SPARSE vectors cannot be used in the creation of vector indexes. + To utilize VECTOR indexing, set the ``oracle_vector`` parameter to True to use the default values provided by Oracle. 
HNSW is the default indexing method:: @@ -1165,14 +1218,16 @@ class OracleTypeCompiler(compiler.GenericTypeCompiler): return "ROWID" def visit_VECTOR(self, type_, **kw): - if type_.dim is None and type_.storage_format is None: - return "VECTOR(*,*)" - elif type_.storage_format is None: - return f"VECTOR({type_.dim},*)" - elif type_.dim is None: - return f"VECTOR(*,{type_.storage_format.value})" - else: - return f"VECTOR({type_.dim},{type_.storage_format.value})" + dim = type_.dim if type_.dim is not None else "*" + storage_format = ( + type_.storage_format.value + if type_.storage_format is not None + else "*" + ) + storage_type = ( + type_.storage_type.value if type_.storage_type is not None else "*" + ) + return f"VECTOR({dim},{storage_format},{storage_type})" class OracleCompiler(compiler.SQLCompiler): diff --git a/lib/sqlalchemy/dialects/oracle/vector.py b/lib/sqlalchemy/dialects/oracle/vector.py index dae89d3418..88d47ea1d1 100644 --- a/lib/sqlalchemy/dialects/oracle/vector.py +++ b/lib/sqlalchemy/dialects/oracle/vector.py @@ -13,6 +13,7 @@ import array from dataclasses import dataclass from enum import Enum from typing import Optional +from typing import Union import sqlalchemy.types as types from sqlalchemy.types import Float @@ -95,6 +96,27 @@ class VectorStorageFormat(Enum): """ +class VectorStorageType(Enum): + """Enum representing the vector storage type. + + See :ref:`oracle_vector_datatype` for background. + + .. versionadded:: 2.0.43 + + """ + + SPARSE = "SPARSE" + """ + A Sparse vector is a vector which has zero value for + most of its dimensions. + """ + DENSE = "DENSE" + """ + A Dense vector is a vector where most, if not all, elements + hold meaningful values. + """ + + @dataclass class VectorIndexConfig: """Define the configuration for Oracle VECTOR Index. @@ -176,6 +198,39 @@ class VectorIndexConfig: ) +class SparseVector: + """ + Lightweight SQLAlchemy-side version of SparseVector. + This mimics oracledb.SparseVector. + + ..
versionadded:: 2.0.43 + + """ + + def __init__( + self, + num_dimensions: int, + indices: Union[list, array.array], + values: Union[list, array.array], + ): + if not isinstance(indices, array.array) or indices.typecode != "I": + indices = array.array("I", indices) + if not isinstance(values, array.array): + values = array.array("d", values) + if len(indices) != len(values): + raise TypeError("indices and values must be of the same length!") + + self.num_dimensions = num_dimensions + self.indices = indices + self.values = values + + def __str__(self): + return ( + f"SparseVector(num_dimensions={self.num_dimensions}, " + f"size={len(self.indices)}, typecode={self.values.typecode})" + ) + + class VECTOR(types.TypeEngine): """Oracle VECTOR datatype. @@ -196,17 +251,22 @@ class VECTOR(types.TypeEngine): VectorStorageFormat.FLOAT64: "d", # Double } - def __init__(self, dim=None, storage_format=None): + def __init__(self, dim=None, storage_format=None, storage_type=None): """Construct a VECTOR. :param dim: integer. The dimension of the VECTOR datatype. This should be an integer value. :param storage_format: VectorStorageFormat. The VECTOR storage - type format. This may be Enum values form + type format. This should be Enum values form :class:`.VectorStorageFormat` INT8, BINARY, FLOAT32, or FLOAT64. + :param storage_type: VectorStorageType. The Vector storage type. This + should be Enum values from :class:`.VectorStorageType` SPARSE or + DENSE. 
+ """ + if dim is not None and not isinstance(dim, int): raise TypeError("dim must be an interger") if storage_format is not None and not isinstance( @@ -215,12 +275,22 @@ class VECTOR(types.TypeEngine): raise TypeError( "storage_format must be an enum of type VectorStorageFormat" ) + if storage_type is not None and not isinstance( + storage_type, VectorStorageType + ): + raise TypeError( + "storage_type must be an enum of type VectorStorageType" + ) + self.dim = dim self.storage_format = storage_format + self.storage_type = storage_type def _cached_bind_processor(self, dialect): """ - Convert a list to a array.array before binding it to the database. + Converts a Python-side SparseVector instance into an + oracledb.SparseVector or a compatible array format before + binding it to the database. """ def process(value): @@ -233,20 +303,48 @@ class VECTOR(types.TypeEngine): value = array.array(typecode, value) return value + # Convert SqlAlchemy SparseVector to oracledb SparseVector object + elif isinstance(value, SparseVector): + return dialect.dbapi.SparseVector( + value.num_dimensions, + value.indices, + value.values, + ) + else: - raise TypeError("VECTOR accepts list or array.array()") + raise TypeError( + """ + Invalid input for VECTOR: expected a list, an array.array, + or a SparseVector object. + """ + ) return process def _cached_result_processor(self, dialect, coltype): """ - Convert a array.array to list before binding it to the database. + Converts database-returned values into Python-native representations. + If the value is an oracledb.SparseVector, it is converted into the + SQLAlchemy-side SparseVector class. + If the value is an array.array, it is converted to a plain Python list.
+ """ def process(value): - if isinstance(value, array.array): + if value is None: + return None + + elif isinstance(value, array.array): return list(value) + # Convert Oracledb SparseVector to SqlAlchemy SparseVector object + elif isinstance(value, dialect.dbapi.SparseVector): + return SparseVector( + num_dimensions=value.num_dimensions, + indices=value.indices, + values=value.values, + ) + return process def _array_typecode(self, typecode): diff --git a/test/dialect/oracle/test_types.py b/test/dialect/oracle/test_types.py index dc060f27e0..c905b921b9 100644 --- a/test/dialect/oracle/test_types.py +++ b/test/dialect/oracle/test_types.py @@ -39,11 +39,13 @@ from sqlalchemy import VARCHAR from sqlalchemy.dialects.oracle import base as oracle from sqlalchemy.dialects.oracle import cx_oracle from sqlalchemy.dialects.oracle import oracledb +from sqlalchemy.dialects.oracle import SparseVector from sqlalchemy.dialects.oracle import VECTOR from sqlalchemy.dialects.oracle import VectorDistanceType from sqlalchemy.dialects.oracle import VectorIndexConfig from sqlalchemy.dialects.oracle import VectorIndexType from sqlalchemy.dialects.oracle import VectorStorageFormat +from sqlalchemy.dialects.oracle import VectorStorageType from sqlalchemy.sql import column from sqlalchemy.sql.sqltypes import NullType from sqlalchemy.testing import AssertsCompiledSQL @@ -1146,6 +1148,51 @@ class TypesTest(fixtures.TestBase): ).first() eq_(res.embedding, [1, 2, 3]) + @testing.only_on("oracle>=23.7") + def test_sparse_vector(self, metadata, connection): + t1 = Table( + "t1", + metadata, + Column("id", Integer), + Column( + "embedding", + VECTOR( + dim=3, + storage_format=VectorStorageFormat.INT8, + storage_type=VectorStorageType.SPARSE, + ), + ), + ) + t1.create(connection) + eq_(t1.c.embedding.type.storage_type, VectorStorageType.SPARSE) + + @testing.only_on("oracle>=23.7") + def test_sparse_vector_insert(self, metadata, connection): + t1 = Table( + "t1", + metadata, + Column("id", Integer), 
+ Column( + "embedding", + VECTOR( + dim=10, + storage_format=VectorStorageFormat.FLOAT32, + storage_type=VectorStorageType.SPARSE, + ), + ), + ) + t1.create(connection) + sparse_vector = SparseVector( + 10, [1, 2], array.array("f", [23.25, 221.625]) + ) + connection.execute(t1.insert(), dict(id=1, embedding=sparse_vector)) + result = connection.execute(t1.select()).first() + eq_(result[0], 1) + eq_(isinstance(result[1], SparseVector), True) + eq_(result[1].num_dimensions, 10) + eq_(result[1].indices, array.array("I", [1, 2])) + eq_(result[1].values, array.array("f", [23.25, 221.625])) + class LOBFetchTest(fixtures.TablesTest): __only_on__ = "oracle"