Fixes: #12711 Added sparse vector support in Oracle

author suraj <suraj.shaw@oracle.com>

Mon, 11 Aug 2025 12:21:46 +0000 (08:21 -0400)

committer Mike Bayer <mike_mp@zzzcomputing.com>

Mon, 11 Aug 2025 12:34:49 +0000 (08:34 -0400)
author suraj <suraj.shaw@oracle.com>
Mon, 11 Aug 2025 12:21:46 +0000 (08:21 -0400)
committer Mike Bayer <mike_mp@zzzcomputing.com>
Mon, 11 Aug 2025 12:34:49 +0000 (08:34 -0400)
diff --git a/doc/build/changelog/unreleased_20/12711.rst b/doc/build/changelog/unreleased_20/12711.rst

new file mode 100644 (file)

index 0000000..f39a3b1
--- /dev/null
+++ b/doc/build/changelog/unreleased_20/12711.rst
@@ -0,0 +1,8 @@
+.. change::
+    :tags: usecase, oracle
+    :tickets: 12711
+
+    Extended :class:`_oracle.VECTOR` to support sparse vectors. This update
+    introduces :class:`_oracle.VectorStorageType` to specify sparse or dense
+    storage and adds :class:`_oracle.SparseVector`. Pull request courtesy
+    Suraj Shaw.
diff --git a/doc/build/dialects/oracle.rst b/doc/build/dialects/oracle.rst

index b9e9a1d087030f5c229f81b6c997f3cc74f1b7e8..fc19a81fa4b8ade1541f006315fe8e0631a6410a 100644 (file)
--- a/doc/build/dialects/oracle.rst
+++ b/doc/build/dialects/oracle.rst
@@ -94,6 +94,12 @@ construction arguments, are as follows:
  .. autoclass:: VectorDistanceType
    :members:
  
+.. autoclass:: VectorStorageType
+  :members:
+
+.. autoclass:: SparseVector
+  :members:
+
  
  .. _oracledb:
  
diff --git a/lib/sqlalchemy/dialects/oracle/__init__.py b/lib/sqlalchemy/dialects/oracle/__init__.py

index 2265de033c93236c166f3e91c3dcb00770d5d7d9..566edf1c3b66085af208e47706b62847e0414da9 100644 (file)
--- a/lib/sqlalchemy/dialects/oracle/__init__.py
+++ b/lib/sqlalchemy/dialects/oracle/__init__.py
@@ -35,8 +35,10 @@ from .base import VARCHAR2
  from .base import VECTOR
  from .base import VectorIndexConfig
  from .base import VectorIndexType
+from .vector import SparseVector
  from .vector import VectorDistanceType
  from .vector import VectorStorageFormat
+from .vector import VectorStorageType
  
  # Alias oracledb also as oracledb_async
  oracledb_async = type(
@@ -74,4 +76,6 @@ __all__ = (
      "VectorIndexType",
      "VectorIndexConfig",
      "VectorStorageFormat",
+    "VectorStorageType",
+    "SparseVector",
  )
diff --git a/lib/sqlalchemy/dialects/oracle/base.py b/lib/sqlalchemy/dialects/oracle/base.py

index f24f4f54b0db0ae4cfcadb9eee3e902761f04f24..83f562eba5cb3d7b6a0a87c33b7643288a228cb3 100644 (file)
--- a/lib/sqlalchemy/dialects/oracle/base.py
+++ b/lib/sqlalchemy/dialects/oracle/base.py
@@ -737,8 +737,22 @@ VECTOR Datatype
  
  Oracle Database 23ai introduced a new VECTOR datatype for artificial intelligence
  and machine learning search operations. The VECTOR datatype is a homogeneous array
-of 8-bit signed integers, 8-bit unsigned integers (binary), 32-bit floating-point numbers,
-or 64-bit floating-point numbers.
+of 8-bit signed integers, 8-bit unsigned integers (binary), 32-bit floating-point
+numbers, or 64-bit floating-point numbers.
+
+A vector's storage type can be either DENSE or SPARSE. A dense vector contains
+meaningful values in most or all of its dimensions. In contrast, a sparse vector
+has non-zero values in only a few dimensions, with the majority being zero.
+
+Sparse vectors are represented by the total number of vector dimensions, an array
+of indices, and an array of values where each value’s location in the vector is
+indicated by the corresponding indices array position. All other vector values are
+treated as zero.
+
+The storage formats that can be used with sparse vectors are float32, float64, and
+int8. Note that the binary storage format cannot be used with sparse vectors.
+
+Sparse vectors are supported when you are using Oracle Database 23.7 or later.
  
  .. seealso::
  
@@ -746,17 +760,26 @@ or 64-bit floating-point numbers.
      <https://python-oracledb.readthedocs.io/en/latest/user_guide/vector_data_type.html>`_ - in the documentation
      for the :ref:`oracledb` driver.
  
-.. versionadded:: 2.0.41
+.. versionadded:: 2.0.41 - Added VECTOR datatype
+
+.. versionadded:: 2.0.43 - Added DENSE/SPARSE support
  
  CREATE TABLE support for VECTOR
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  
-With the :class:`.VECTOR` datatype, you can specify the dimension for the data
-and the storage format. Valid values for storage format are enum values from
-:class:`.VectorStorageFormat`. To create a table that includes a
-:class:`.VECTOR` column::
+With the :class:`.VECTOR` datatype, you can specify the number of dimensions,
+the storage format, and the storage type for the data. Valid values for the
+storage format are enum members of :class:`.VectorStorageFormat`. Valid values
+for the storage type are enum members of :class:`.VectorStorageType`. If
+storage type is not specified, a DENSE vector is created by default.
+
+To create a table that includes a :class:`.VECTOR` column::
  
-    from sqlalchemy.dialects.oracle import VECTOR, VectorStorageFormat
+    from sqlalchemy.dialects.oracle import (
+        VECTOR,
+        VectorStorageFormat,
+        VectorStorageType,
+    )
  
      t = Table(
          "t1",
@@ -764,7 +787,11 @@ and the storage format. Valid values for storage format are enum values from
          Column("id", Integer, primary_key=True),
          Column(
              "embedding",
-            VECTOR(dim=3, storage_format=VectorStorageFormat.FLOAT32),
+            VECTOR(
+                dim=3,
+                storage_format=VectorStorageFormat.FLOAT32,
+                storage_type=VectorStorageType.SPARSE,
+            ),
          ),
          Column(...),
          ...,
@@ -772,31 +799,40 @@ and the storage format. Valid values for storage format are enum values from
  
  Vectors can also be defined with an arbitrary number of dimensions and formats.
  This allows you to specify vectors of different dimensions with the various
-storage formats mentioned above.
+storage formats mentioned below.
  
  **Examples**
  
-* In this case, the storage format is flexible, allowing any vector type data to be inserted,
-  such as INT8 or BINARY etc::
+* In this case, the storage format is flexible, allowing any vector type data to be
+  inserted, such as INT8 or BINARY etc::
  
      vector_col: Mapped[array.array] = mapped_column(VECTOR(dim=3))
  
-* The dimension is flexible in this case, meaning that any dimension vector can be used::
+* The dimension is flexible in this case, meaning that any dimension vector can
+  be used::
  
      vector_col: Mapped[array.array] = mapped_column(
          VECTOR(storage_format=VectorStorageFormat.INT8)
      )
  
-* Both the dimensions and the storage format are flexible::
+* Both the dimensions and the storage format are flexible. It creates a DENSE vector::
  
      vector_col: Mapped[array.array] = mapped_column(VECTOR)
  
+* To create a SPARSE vector whose dimensions and storage format are both flexible,
+  use the :attr:`.VectorStorageType.SPARSE` storage type::
+
+    vector_col: Mapped[array.array] = mapped_column(
+        VECTOR(storage_type=VectorStorageType.SPARSE)
+    )
+
  Python Datatypes for VECTOR
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~
  
  VECTOR data can be inserted using Python list or Python ``array.array()`` objects.
-Python arrays of type FLOAT (32-bit), DOUBLE (64-bit), or INT (8-bit signed integer)
-are used as bind values when inserting VECTOR columns::
+Python arrays of type FLOAT (32-bit), DOUBLE (64-bit), INT (8-bit signed integers),
+or BINARY (8-bit unsigned integers) are used as bind values when inserting
+VECTOR columns::
  
      from sqlalchemy import insert, select
  
@@ -806,6 +842,21 @@ are used as bind values when inserting VECTOR columns::
              {"id": 1, "embedding": [1, 2, 3]},
          )
  
+Data can be inserted into a sparse vector using the :class:`_oracle.SparseVector`
+class, creating an object consisting of the number of dimensions, an array of indices, and a
+corresponding array of values::
+
+    from sqlalchemy import insert, select
+    from sqlalchemy.dialects.oracle import SparseVector
+
+    sparse_val = SparseVector(10, [1, 2], array.array("d", [23.45, 221.22]))
+
+    with engine.begin() as conn:
+        conn.execute(
+            insert(t1),
+            {"id": 1, "embedding": sparse_val},
+        )
+
  VECTOR Indexes
  ~~~~~~~~~~~~~~
  
@@ -813,6 +864,8 @@ The VECTOR feature supports an Oracle-specific parameter ``oracle_vector``
  on the :class:`.Index` construct, which allows the construction of VECTOR
  indexes.
  
+SPARSE vectors cannot be used in the creation of vector indexes.
+
  To utilize VECTOR indexing, set the ``oracle_vector`` parameter to True to use
  the default values provided by Oracle. HNSW is the default indexing method::
  
@@ -1165,14 +1218,16 @@ class OracleTypeCompiler(compiler.GenericTypeCompiler):
          return "ROWID"
  
      def visit_VECTOR(self, type_, **kw):
-        if type_.dim is None and type_.storage_format is None:
-            return "VECTOR(*,*)"
-        elif type_.storage_format is None:
-            return f"VECTOR({type_.dim},*)"
-        elif type_.dim is None:
-            return f"VECTOR(*,{type_.storage_format.value})"
-        else:
-            return f"VECTOR({type_.dim},{type_.storage_format.value})"
+        dim = type_.dim if type_.dim is not None else "*"
+        storage_format = (
+            type_.storage_format.value
+            if type_.storage_format is not None
+            else "*"
+        )
+        storage_type = (
+            type_.storage_type.value if type_.storage_type is not None else "*"
+        )
+        return f"VECTOR({dim},{storage_format},{storage_type})"
  
  
  class OracleCompiler(compiler.SQLCompiler):
diff --git a/lib/sqlalchemy/dialects/oracle/vector.py b/lib/sqlalchemy/dialects/oracle/vector.py

index dae89d3418d2c965e233fae971ad5bdcebdc0f46..88d47ea1d1017037b589b8f43743beb30b35b40f 100644 (file)
--- a/lib/sqlalchemy/dialects/oracle/vector.py
+++ b/lib/sqlalchemy/dialects/oracle/vector.py
@@ -13,6 +13,7 @@ import array
  from dataclasses import dataclass
  from enum import Enum
  from typing import Optional
+from typing import Union
  
  import sqlalchemy.types as types
  from sqlalchemy.types import Float
@@ -95,6 +96,27 @@ class VectorStorageFormat(Enum):
      """
  
  
+class VectorStorageType(Enum):
+    """Enum representing the vector storage type.
+
+    See :ref:`oracle_vector_datatype` for background.
+
+    .. versionadded:: 2.0.43
+
+    """
+
+    SPARSE = "SPARSE"
+    """
+    A Sparse vector is a vector which has zero value for
+    most of its dimensions.
+    """
+    DENSE = "DENSE"
+    """
+    A Dense vector is a vector where most, if not all, elements
+    hold meaningful values.
+    """
+
+
  @dataclass
  class VectorIndexConfig:
      """Define the configuration for Oracle VECTOR Index.
@@ -176,6 +198,39 @@ class VectorIndexConfig:
                  )
  
  
+class SparseVector:
+    """
+    Lightweight SQLAlchemy-side version of SparseVector.
+    This mimics oracledb.SparseVector.
+
+    .. versionadded:: 2.0.43
+
+    """
+
+    def __init__(
+        self,
+        num_dimensions: int,
+        indices: Union[list, array.array],
+        values: Union[list, array.array],
+    ):
+        if not isinstance(indices, array.array) or indices.typecode != "I":
+            indices = array.array("I", indices)
+        if not isinstance(values, array.array):
+            values = array.array("d", values)
+        if len(indices) != len(values):
+            raise TypeError("indices and values must be of the same length!")
+
+        self.num_dimensions = num_dimensions
+        self.indices = indices
+        self.values = values
+
+    def __str__(self):
+        return (
+            f"SparseVector(num_dimensions={self.num_dimensions}, "
+            f"size={len(self.indices)}, typecode={self.values.typecode})"
+        )
+
+
  class VECTOR(types.TypeEngine):
      """Oracle VECTOR datatype.
  
@@ -196,17 +251,22 @@ class VECTOR(types.TypeEngine):
          VectorStorageFormat.FLOAT64: "d",  # Double
      }
  
-    def __init__(self, dim=None, storage_format=None):
+    def __init__(self, dim=None, storage_format=None, storage_type=None):
          """Construct a VECTOR.
  
          :param dim: integer. The dimension of the VECTOR datatype. This
           should be an integer value.
  
          :param storage_format: VectorStorageFormat. The VECTOR storage
-         type format. This may be Enum values form
+         type format. This should be Enum values from
           :class:`.VectorStorageFormat` INT8, BINARY, FLOAT32, or FLOAT64.
  
+        :param storage_type: VectorStorageType. The Vector storage type. This
+         should be Enum values from :class:`.VectorStorageType` SPARSE or
+         DENSE.
+
          """
+
          if dim is not None and not isinstance(dim, int):
              raise TypeError("dim must be an interger")
          if storage_format is not None and not isinstance(
@@ -215,12 +275,22 @@ class VECTOR(types.TypeEngine):
              raise TypeError(
                  "storage_format must be an enum of type VectorStorageFormat"
              )
+        if storage_type is not None and not isinstance(
+            storage_type, VectorStorageType
+        ):
+            raise TypeError(
+                "storage_type must be an enum of type VectorStorageType"
+            )
+
          self.dim = dim
          self.storage_format = storage_format
+        self.storage_type = storage_type
  
      def _cached_bind_processor(self, dialect):
          """
-        Convert a list to a array.array before binding it to the database.
+        Converts a Python-side SparseVector instance into an
+        oracledb.SparseVector or a compatible array format before
+        binding it to the database.
          """
  
          def process(value):
@@ -233,20 +303,48 @@ class VECTOR(types.TypeEngine):
                  value = array.array(typecode, value)
                  return value
  
+            # Convert SQLAlchemy SparseVector to oracledb SparseVector object
+            elif isinstance(value, SparseVector):
+                return dialect.dbapi.SparseVector(
+                    value.num_dimensions,
+                    value.indices,
+                    value.values,
+                )
+
              else:
-                raise TypeError("VECTOR accepts list or array.array()")
+                raise TypeError(
+                    """
+                    Invalid input for VECTOR: expected a list, an array.array,
+                    or a SparseVector object.
+                    """
+                )
  
          return process
  
      def _cached_result_processor(self, dialect, coltype):
          """
-        Convert a array.array to list before binding it to the database.
+        Converts database-returned values into Python-native representations.
+        If the value is an oracledb.SparseVector, it is converted into the
+        SQLAlchemy-side SparseVector class.
+        If the value is an array.array, it is converted to a plain Python list.
+
          """
  
          def process(value):
-            if isinstance(value, array.array):
+            if value is None:
+                return None
+
+            elif isinstance(value, array.array):
                  return list(value)
  
+            # Convert oracledb SparseVector to SQLAlchemy SparseVector object
+            elif isinstance(value, dialect.dbapi.SparseVector):
+                return SparseVector(
+                    num_dimensions=value.num_dimensions,
+                    indices=value.indices,
+                    values=value.values,
+                )
+
          return process
  
      def _array_typecode(self, typecode):
diff --git a/test/dialect/oracle/test_types.py b/test/dialect/oracle/test_types.py

index dc060f27e03ca77da4c8022727ca4d3b611b58e2..c905b921b90247ee4c03b08f33ae9de734a92816 100644 (file)
--- a/test/dialect/oracle/test_types.py
+++ b/test/dialect/oracle/test_types.py
@@ -39,11 +39,13 @@ from sqlalchemy import VARCHAR
  from sqlalchemy.dialects.oracle import base as oracle
  from sqlalchemy.dialects.oracle import cx_oracle
  from sqlalchemy.dialects.oracle import oracledb
+from sqlalchemy.dialects.oracle import SparseVector
  from sqlalchemy.dialects.oracle import VECTOR
  from sqlalchemy.dialects.oracle import VectorDistanceType
  from sqlalchemy.dialects.oracle import VectorIndexConfig
  from sqlalchemy.dialects.oracle import VectorIndexType
  from sqlalchemy.dialects.oracle import VectorStorageFormat
+from sqlalchemy.dialects.oracle import VectorStorageType
  from sqlalchemy.sql import column
  from sqlalchemy.sql.sqltypes import NullType
  from sqlalchemy.testing import AssertsCompiledSQL
@@ -1146,6 +1148,51 @@ class TypesTest(fixtures.TestBase):
          ).first()
          eq_(res.embedding, [1, 2, 3])
  
+    @testing.only_on("oracle>=23.7")
+    def test_sparse_vector(self, metadata, connection):
+        t1 = Table(
+            "t1",
+            metadata,
+            Column("id", Integer),
+            Column(
+                "embedding",
+                VECTOR(
+                    dim=3,
+                    storage_format=VectorStorageFormat.INT8,
+                    storage_type=VectorStorageType.SPARSE,
+                ),
+            ),
+        )
+        t1.create(connection)
+        eq_(t1.c.embedding.type.storage_type, VectorStorageType.SPARSE)
+
+    @testing.only_on("oracle>=23.7")
+    def test_sparse_vector_insert(self, metadata, connection):
+        t1 = Table(
+            "t1",
+            metadata,
+            Column("id", Integer),
+            Column(
+                "embedding",
+                VECTOR(
+                    dim=10,
+                    storage_format=VectorStorageFormat.FLOAT32,
+                    storage_type=VectorStorageType.SPARSE,
+                ),
+            ),
+        )
+        t1.create(connection)
+        sparse_vector = SparseVector(
+            10, [1, 2], array.array("f", [23.25, 221.625])
+        )
+        connection.execute(t1.insert(), dict(id=1, embedding=sparse_vector))
+        result = connection.execute(t1.select()).first()
+        eq_(result[0], 1)
+        eq_(isinstance(result[1], SparseVector), True)
+        eq_(result[1].num_dimensions, 10)
+        eq_(result[1].indices, array.array("I", [1, 2]))
+        eq_(result[1].values, array.array("f", [23.25, 221.625]))
+
  
  class LOBFetchTest(fixtures.TablesTest):
      __only_on__ = "oracle"
author	suraj <suraj.shaw@oracle.com>
	Mon, 11 Aug 2025 12:21:46 +0000 (08:21 -0400)
committer	Mike Bayer <mike_mp@zzzcomputing.com>
	Mon, 11 Aug 2025 12:34:49 +0000 (08:34 -0400)
doc/build/changelog/unreleased_20/12711.rst	[new file with mode: 0644]	patch \| blob
doc/build/dialects/oracle.rst		patch \| blob \| blame \| history
lib/sqlalchemy/dialects/oracle/__init__.py		patch \| blob \| blame \| history
lib/sqlalchemy/dialects/oracle/base.py		patch \| blob \| blame \| history
lib/sqlalchemy/dialects/oracle/vector.py		patch \| blob \| blame \| history
test/dialect/oracle/test_types.py		patch \| blob \| blame \| history