Skip to content

Fixing gaps for Containers with HashV1 #41222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3121,9 +3121,11 @@ def __GetBodiesFromQueryResult(result: Dict[str, Any]) -> List[Dict[str, Any]]:
if isPrefixPartitionQuery:
last_response_headers = CaseInsensitiveDict()
# here get the overlapping ranges
partition_key_definition = kwargs.pop("partitionKeyDefinition", None)
pk_properties = partition_key_definition
partition_key_definition = PartitionKey(path=pk_properties["paths"], kind=pk_properties["kind"])
pk_properties: Optional[PartitionKey] = kwargs.pop("partitionKeyDefinition", None)
partition_key_definition = PartitionKey(
path=pk_properties["paths"],
kind=pk_properties["kind"],
version=pk_properties["version"])
partition_key_value = pk_properties["partition_key"]
feedrangeEPK = partition_key_definition._get_epk_range_for_prefix_partition_key(
partition_key_value
Expand Down
42 changes: 42 additions & 0 deletions sdk/cosmos/azure-cosmos/azure/cosmos/_cosmos_murmurhash3.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,45 @@ def murmurhash3_128(span: bytearray, seed: _UInt128) -> _UInt128: # pylint: dis
h2 += h1

return _UInt128(int(h1.value), int(h2.value))

def murmurhash3_32(data: bytearray, length: int, seed: int) -> int:
    """Compute the 32-bit MurmurHash3 (x86_32 variant) of a byte buffer.

    Python integers are unbounded, so every multiplication, addition and
    rotation is reduced modulo 2**32 here. Without that reduction, bits above
    bit 31 survive the arithmetic and are pulled back into the low word by the
    right shifts, which yields a value that differs from the reference
    algorithm even after the caller masks the result.

    :param bytearray data: Buffer holding the bytes to hash.
    :param int length: Number of leading bytes of ``data`` to hash.
    :param int seed: Hash seed; only its low 32 bits are used.
    :returns: The hash as an unsigned 32-bit integer (``0 <= h < 2**32``).
    :rtype: int
    :raises IndexError: If ``length`` is negative or exceeds ``len(data)``.
    """
    if length < 0 or length > len(data):
        # Indexing past the buffer raised IndexError before; keep that
        # contract explicit, since slicing would silently truncate instead.
        raise IndexError("length is out of range for the supplied data")

    mask = 0xffffffff
    c1 = 0xcc9e2d51
    c2 = 0x1b873593

    def _rotl32(value: int, shift: int) -> int:
        # Rotate a 32-bit value left; the input must already fit in 32 bits.
        return ((value << shift) | (value >> (32 - shift))) & mask

    h1 = seed & mask
    tail_len = length & 0x03
    rounded_end = length - tail_len  # round down to a whole 4-byte block

    # body: consume the input one little-endian 32-bit word at a time
    for i in range(0, rounded_end, 4):
        k1 = int.from_bytes(data[i:i + 4], "little")
        k1 = (k1 * c1) & mask
        k1 = _rotl32(k1, 15)
        k1 = (k1 * c2) & mask

        h1 ^= k1
        h1 = _rotl32(h1, 13)
        h1 = (h1 * 5 + 0xe6546b64) & mask

    # tail: the remaining 1-3 bytes are mixed in without the h1 rotation step
    if tail_len:
        k1 = int.from_bytes(data[rounded_end:rounded_end + tail_len], "little")
        k1 = (k1 * c1) & mask
        k1 = _rotl32(k1, 15)
        k1 = (k1 * c2) & mask
        h1 ^= k1

    # finalization: fold in the length, then avalanche the bits
    h1 ^= length & mask
    h1 ^= h1 >> 16
    h1 = (h1 * 0x85ebca6b) & mask
    h1 ^= h1 >> 13
    h1 = (h1 * 0xc2b2ae35) & mask
    h1 ^= h1 >> 16

    return h1
5 changes: 4 additions & 1 deletion sdk/cosmos/azure-cosmos/azure/cosmos/aio/_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,10 @@ async def _get_epk_range_for_partition_key(self, partition_key_value: PartitionK

container_properties = await self._get_properties()
partition_key_definition = container_properties["partitionKey"]
partition_key = PartitionKey(path=partition_key_definition["paths"], kind=partition_key_definition["kind"])
partition_key = PartitionKey(
path=partition_key_definition["paths"],
kind=partition_key_definition["kind"],
version=partition_key_definition["version"])

return partition_key._get_epk_range_for_partition_key(partition_key_value)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2914,7 +2914,10 @@ def __GetBodiesFromQueryResult(result: Dict[str, Any]) -> List[Dict[str, Any]]:
if cont_prop:
cont_prop = await cont_prop()
pk_properties = cont_prop["partitionKey"]
partition_key_definition = PartitionKey(path=pk_properties["paths"], kind=pk_properties["kind"])
partition_key_definition = PartitionKey(
path=pk_properties["paths"],
kind=pk_properties["kind"],
version=pk_properties["version"])
if partition_key_definition.kind == "MultiHash" and \
(isinstance(partition_key, List) and \
len(partition_key_definition['paths']) != len(partition_key)):
Expand Down
10 changes: 8 additions & 2 deletions sdk/cosmos/azure-cosmos/azure/cosmos/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,10 @@ def _set_partition_key(
def _get_epk_range_for_partition_key( self, partition_key_value: PartitionKeyType) -> Range:
container_properties = self._get_properties()
partition_key_definition = container_properties["partitionKey"]
partition_key = PartitionKey(path=partition_key_definition["paths"], kind=partition_key_definition["kind"])
partition_key = PartitionKey(
path=partition_key_definition["paths"],
kind=partition_key_definition["kind"],
version=partition_key_definition["version"])

return partition_key._get_epk_range_for_partition_key(partition_key_value)

Expand Down Expand Up @@ -715,7 +718,10 @@ def __is_prefix_partitionkey(
self, partition_key: PartitionKeyType) -> bool:
properties = self._get_properties()
pk_properties = properties["partitionKey"]
partition_key_definition = PartitionKey(path=pk_properties["paths"], kind=pk_properties["kind"])
partition_key_definition = PartitionKey(
path=pk_properties["paths"],
kind=pk_properties["kind"],
version=pk_properties["version"])
return partition_key_definition._is_prefix_partition_key(partition_key)


Expand Down
65 changes: 53 additions & 12 deletions sdk/cosmos/azure-cosmos/azure/cosmos/partition_key.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from typing_extensions import Literal

from ._cosmos_integers import _UInt64, _UInt128
from ._cosmos_murmurhash3 import murmurhash3_128 as _murmurhash3_128
from ._cosmos_murmurhash3 import murmurhash3_128 as _murmurhash3_128, murmurhash3_32 as _murmurhash3_32
from ._routing.routing_range import Range as _Range


Expand Down Expand Up @@ -187,9 +187,34 @@ def _get_epk_range_for_partition_key(
cast(List[Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]]], [pk_value]))
return _Range(effective_partition_key_string, effective_partition_key_string, True, True)

def _get_effective_partition_key_for_hash_partitioning(self) -> str:
# We shouldn't be supporting V1
return ""
@staticmethod
def _as_unsigned_long(x: int) -> int:
    """Return ``x`` reduced to its low 32 bits as a non-negative integer."""
    # For Python ints, reduction modulo 2**32 equals masking with 0xFFFFFFFF,
    # including for negative inputs.
    return x % (1 << 32)

@staticmethod
def _truncate_for_v1_hashing(
        value: Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]]
) -> Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]]:
    """Prepare a partition key component for V1 hashing.

    :param value: A single partition key component.
    :returns: The first 100 characters of ``value`` when it is a string;
        any other value is returned unchanged.
    """
    return value[:100] if isinstance(value, str) else value

@staticmethod
def _get_effective_partition_key_for_hash_partitioning(
pk_value: Sequence[Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]]]
) -> str:
with (BytesIO() as ms):
truncated_components: List[Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]]] = \
[None] + [PartitionKey._truncate_for_v1_hashing(v) for v in pk_value]

for component in truncated_components[1:]:
PartitionKey._write_for_hashing(component, ms)

ms_bytes: bytes = ms.getvalue()
hash_as_int: int = _murmurhash3_32(bytearray(ms_bytes),len(bytes), 0)
Copy link
Preview

Copilot AI May 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The call to murmurhash3_32 uses len(bytes) instead of the actual buffer length. Replace len(bytes) with len(ms_bytes) to hash the correct number of bytes.

Suggested change
hash_as_int: int = _murmurhash3_32(bytearray(ms_bytes),len(bytes), 0)
hash_as_int: int = _murmurhash3_32(bytearray(ms_bytes), len(ms_bytes), 0)

Copilot uses AI. Check for mistakes.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll go ahead and change this a bit: we can derive the length inside _murmurhash3_32 from the bytearray that is passed in, similar to what the 128-bit murmurhash does.

hash_value = float(PartitionKey._as_unsigned_long(hash_as_int))

truncated_components[0] = hash_value

return _to_hex_encoded_binary_string(truncated_components)

def _get_effective_partition_key_string(
self,
Expand All @@ -205,17 +230,32 @@ def _get_effective_partition_key_string(
if kind == 'Hash':
version = self.version or 2
if version == 1:
return self._get_effective_partition_key_for_hash_partitioning()
return PartitionKey._get_effective_partition_key_for_hash_partitioning(pk_value)
if version == 2:
return self._get_effective_partition_key_for_hash_partitioning_v2(pk_value)
return PartitionKey._get_effective_partition_key_for_hash_partitioning_v2(pk_value)
elif kind == 'MultiHash':
return self._get_effective_partition_key_for_multi_hash_partitioning_v2(pk_value)
return _to_hex_encoded_binary_string(pk_value)

@staticmethod
def _write_for_hashing(
        value: Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]],
        writer: IO[bytes]
) -> None:
    """Write one partition key component to ``writer`` for hashing.

    Delegates to ``_write_for_hashing_core`` with a single zero byte as the
    string suffix.

    :param value: The partition key component to serialize.
    :param writer: Binary stream that receives the serialized bytes.
    """
    zero_suffix = bytes([0])
    PartitionKey._write_for_hashing_core(value, zero_suffix, writer)

@staticmethod
def _write_for_hashing_v2(
self,
value: Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]],
writer: IO[bytes]
) -> None:
PartitionKey._write_for_hashing_core(value, bytes([0xFF]), writer)

@staticmethod
def _write_for_hashing_core(
value: Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]],
string_suffix: bytes,
writer: IO[bytes]
) -> None:
if value is True:
writer.write(bytes([_PartitionKeyComponentType.PTrue]))
Expand All @@ -232,17 +272,18 @@ def _write_for_hashing_v2(
elif isinstance(value, str):
writer.write(bytes([_PartitionKeyComponentType.String]))
writer.write(value.encode('utf-8'))
writer.write(bytes([0xFF]))
writer.write(string_suffix)
elif isinstance(value, _Undefined):
writer.write(bytes([_PartitionKeyComponentType.Undefined]))


@staticmethod
def _get_effective_partition_key_for_hash_partitioning_v2(
self,
pk_value: Sequence[Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]]]
) -> str:
with BytesIO() as ms:
for component in pk_value:
self._write_for_hashing_v2(component, ms)
PartitionKey._write_for_hashing_v2(component, ms)

ms_bytes = ms.getvalue()
hash128 = _murmurhash3_128(bytearray(ms_bytes), _UInt128(0, 0))
Expand All @@ -255,8 +296,8 @@ def _get_effective_partition_key_for_hash_partitioning_v2(

return ''.join('{:02X}'.format(x) for x in hash_bytes)

@staticmethod
def _get_effective_partition_key_for_multi_hash_partitioning_v2(
self,
pk_value: Sequence[Union[None, bool, int, float, str, _Undefined, Type[NonePartitionKeyValue]]]
) -> str:
sb = []
Expand All @@ -265,7 +306,7 @@ def _get_effective_partition_key_for_multi_hash_partitioning_v2(
binary_writer = ms # In Python, you can write bytes directly to a BytesIO object

# Assuming paths[i] is the correct object to call write_for_hashing_v2 on
self._write_for_hashing_v2(value, binary_writer)
PartitionKey._write_for_hashing_v2(value, binary_writer)

ms_bytes = ms.getvalue()
hash128 = _murmurhash3_128(bytearray(ms_bytes), _UInt128(0, 0))
Expand Down
Loading