group-wbl/.venv/lib/python3.13/site-packages/whoosh/util/numlists.py
2026-01-09 09:12:25 +08:00

374 lines
10 KiB
Python

from array import array
from whoosh.compat import xrange
from whoosh.system import emptybytes
from whoosh.system import pack_byte, unpack_byte
from whoosh.system import pack_ushort_le, unpack_ushort_le
from whoosh.system import pack_uint_le, unpack_uint_le
def delta_encode(nums):
    """Yield the difference between each number and the one before it.

    The first number is measured against an implicit starting value of 0, so
    it comes out unchanged. ``delta_decode`` reverses the transformation.

    :param nums: an iterable of numbers.
    """
    previous = 0
    for current in nums:
        gap = current - previous
        previous = current
        yield gap
def delta_decode(nums):
    """Yield the running total of ``nums``, counting up from 0.

    This turns a stream of differences (as produced by ``delta_encode``) back
    into the original numbers.

    :param nums: an iterable of differences.
    """
    running = 0
    for gap in nums:
        running += gap
        yield running
class GrowableArray(object):
    """Append-only list of integers that widens its storage on demand.

    Values are kept in an ``array.array`` that starts with the typecode given
    to the constructor. When an appended value does not fit (the array raises
    ``OverflowError``), the contents are copied into an array with a wider
    typecode and the append is retried. If ``array`` rejects the wider
    typecode with ``ValueError``, the contents are moved to a plain list
    instead.

    :param inittype: the ``array`` typecode to start with.
    :param allow_longs: whether the storage may be widened to typecode "q".
        When false, a value that needs "q" raises ``OverflowError``.
    """

    def __init__(self, inittype="B", allow_longs=True):
        self.array = array(inittype)
        self._allow_longs = allow_longs

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.array)

    def __len__(self):
        return len(self.array)

    def __iter__(self):
        return iter(self.array)

    def _retype(self, maxnum):
        """Rebuild the storage with a typecode wide enough for ``maxnum``.

        The new typecode is chosen from the range spanned by ``maxnum`` AND
        the values already stored, so a negative number (or a large number
        joining existing negatives) selects a signed type instead of an
        unsigned one that could never hold the data.

        :param maxnum: the value that did not fit in the current storage.
        :raises OverflowError: if typecode "q" would be needed but
            ``allow_longs`` is false.
        """

        lowest = highest = maxnum
        if len(self.array):
            lowest = min(lowest, min(self.array))
            highest = max(highest, max(self.array))

        if lowest >= 0 and highest < 2 ** 16:
            newtype = "H"
        elif lowest >= -(2 ** 31) and highest < 2 ** 31:
            newtype = "i"
        elif lowest >= 0 and highest < 2 ** 32:
            newtype = "I"
        elif self._allow_longs:
            newtype = "q"
        else:
            raise OverflowError("%r is too big to fit in an array" % maxnum)

        try:
            self.array = array(newtype, iter(self.array))
        except ValueError:
            # array() did not accept the typecode; keep the numbers in a
            # plain list, which has no size limit
            self.array = list(self.array)

    def append(self, n):
        """Add ``n`` to the end, widening the storage first if necessary.

        :raises OverflowError: if ``n`` does not fit even after widening.
        """

        try:
            self.array.append(n)
        except OverflowError:
            self._retype(n)
            self.array.append(n)

    def extend(self, ns):
        """Append every number in the iterable ``ns``, one at a time, so each
        value gets the same widening treatment as ``append``.
        """

        append = self.append
        for n in ns:
            append(n)

    @property
    def typecode(self):
        """The typecode of the underlying array, or "q" when the numbers are
        being held in a plain list.
        """

        if isinstance(self.array, array):
            return self.array.typecode
        else:
            return "q"

    def to_file(self, dbfile):
        """Write the stored numbers to ``dbfile``.

        An array is handed to ``dbfile.write_array`` in one call; a list is
        written one number at a time with ``dbfile.write_long``.
        """

        if isinstance(self.array, array):
            dbfile.write_array(self.array)
        else:
            write_long = dbfile.write_long
            for n in self.array:
                write_long(n)
# Number list encoding base class
class NumberEncoding(object):
    """Base class for schemes that write and read lists of numbers.

    Subclasses supply ``write_nums`` and ``read_nums``; the delta helpers and
    the default ``get`` are built on top of those two methods.
    """

    # Largest number the encoding can store; None is the base-class default
    maxint = None

    def write_nums(self, f, numbers):
        """Write ``numbers`` to the file ``f``. Must be overridden."""

        raise NotImplementedError

    def read_nums(self, f, n):
        """Read ``n`` numbers from the file ``f``. Must be overridden."""

        raise NotImplementedError

    def write_deltas(self, f, numbers):
        """Write ``numbers`` as the differences between consecutive values.

        The differences are collected into a list before being passed to
        ``write_nums``.
        """

        gaps = list(delta_encode(numbers))
        return self.write_nums(f, gaps)

    def read_deltas(self, f, n):
        """Read ``n`` stored differences and return an iterator of the
        numbers they add up to.
        """

        stored = self.read_nums(f, n)
        return delta_decode(stored)

    def get(self, f, pos, i):
        """Return the number at index ``i`` of the list starting at file
        position ``pos``.

        This default seeks to ``pos``, reads the first ``i + 1`` numbers and
        keeps the last one. The result is None if ``read_nums`` produces
        nothing.
        """

        f.seek(pos)
        found = None
        for candidate in self.read_nums(f, i + 1):
            found = candidate
        return found
# Fixed width encodings
class FixedEncoding(NumberEncoding):
    """Base class for encodings that store every number in ``size`` bytes.

    Subclasses set ``size`` (bytes per number), ``_encode`` (number -> bytes)
    and ``_decode`` (bytes -> number). Because every number occupies the same
    width, ``get`` can seek straight to the requested item.
    """

    _encode = None
    _decode = None
    size = None

    def _decode_one(self, data):
        """Decode one stored number from ``data`` and return it as a plain
        value.

        The ``unpack_*`` helpers plugged in as ``_decode`` by the subclasses
        in this module are subscripted with ``[0]`` where they are called
        directly elsewhere in this file, i.e. they hand back a sequence
        rather than a number. Unwrap such results so this encoding yields
        plain numbers like the other encodings do (and so the inherited
        ``read_deltas`` can add them up). A decoder that already returns a
        bare number passes through untouched.
        """

        value = self._decode(data)
        if isinstance(value, tuple):
            value = value[0]
        return value

    def write_nums(self, f, numbers):
        """Write each number in ``numbers`` to ``f`` using ``_encode``."""

        _encode = self._encode

        for n in numbers:
            f.write(_encode(n))

    def read_nums(self, f, n):
        """Yield ``n`` numbers, reading ``size`` bytes from ``f`` for each."""

        _decode_one = self._decode_one

        for _ in xrange(n):
            yield _decode_one(f.read(self.size))

    def get(self, f, pos, i):
        """Return item ``i`` of the list starting at file position ``pos`` by
        seeking directly to ``pos + i * size``.
        """

        f.seek(pos + i * self.size)
        return self._decode_one(f.read(self.size))
class ByteEncoding(FixedEncoding):
    """Fixed-width encoding that spends one byte on each number."""

    _encode = pack_byte
    _decode = unpack_byte
    size = 1
    maxint = 0xFF
class UShortEncoding(FixedEncoding):
    """Fixed-width encoding that spends two bytes on each number."""

    _encode = pack_ushort_le
    _decode = unpack_ushort_le
    size = 2
    maxint = 0xFFFF
class UIntEncoding(FixedEncoding):
    """Fixed-width encoding that spends four bytes on each number."""

    _encode = pack_uint_le
    _decode = unpack_uint_le
    size = 4
    maxint = 0xFFFFFFFF
# High-bit encoded variable-length integer
class Varints(NumberEncoding):
    """Encoding that delegates to the file object's own varint methods."""

    maxint = None

    def write_nums(self, f, numbers):
        """Write each number with ``f.write_varint``."""

        for number in numbers:
            f.write_varint(number)

    def read_nums(self, f, n):
        """Yield ``n`` numbers, each obtained from ``f.read_varint()``."""

        for _unused in xrange(n):
            yield f.read_varint()
# Simple16 algorithm for storing arrays of positive integers (usually delta
# encoded lists of sorted integers)
#
# 1. http://www2008.org/papers/pdf/p387-zhangA.pdf
# 2. http://www2009.org/proceedings/pdf/p401.pdf
class Simple16(NumberEncoding):
    """Packs runs of small non-negative integers into 32-bit words.

    Every word carries a 4-bit selector ("key") in its top bits and 28
    payload bits below it. The key picks one of 16 layouts; a layout says how
    many numbers the word holds (``_num[key]``) and how many bits each one
    gets (``_bits[key]``). Numbers are stored starting from the low bits.
    """

    # The maximum possible integer value Simple16 can encode is < 2^28.
    # Therefore, in order to use Simple16, the application must have its own
    # code to encode numbers in the range of [2^28, 2^32). A simple way is just
    # write those numbers as 32-bit integers (that is, no compression for very
    # big numbers).
    _numsize = 16
    _bitsize = 28
    maxint = 2 ** _bitsize - 1

    # Number of stored numbers per code
    _num = [28, 21, 21, 21, 14, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 1]
    # Number of bits for each number per code
    _bits = [
        (1,) * 28,
        (2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
        (1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1),
        (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2),
        (2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2),
        (4, 3, 3, 3, 3, 3, 3, 3, 3),
        (3, 4, 4, 4, 4, 3, 3, 3),
        (4, 4, 4, 4, 4, 4, 4),
        (5, 5, 5, 5, 4, 4),
        (4, 4, 5, 5, 5, 5),
        (6, 6, 6, 5, 5),
        (5, 5, 6, 6, 6),
        (7, 7, 7, 7),
        (10, 9, 9),
        (14, 14),
        (28,),
    ]

    def write_nums(self, f, numbers):
        """Pack ``numbers`` into words and write each with
        ``f.write_uint_le``.

        :param numbers: a sequence (it is indexed and measured with ``len``)
            of integers between 0 and ``maxint``.
        :raises ValueError: if a number cannot be stored in any layout.
        """

        _compress = self._compress
        total = len(numbers)

        i = 0
        while i < total:
            value, taken = _compress(numbers, i, total - i)
            f.write_uint_le(value)
            i += taken

    def _compress(self, inarray, inoffset, n):
        """Pack as many numbers as possible from ``inarray`` into one word.

        Layouts are tried from the one holding the most numbers to the one
        holding the fewest; the first layout whose bit widths accommodate the
        upcoming numbers wins.

        :param inarray: the sequence of numbers being written.
        :param inoffset: index of the first number not yet written.
        :param n: how many numbers remain from ``inoffset`` onwards.
        :returns: a ``(word, count)`` tuple, where ``count`` is how many
            numbers were packed into ``word``.
        :raises ValueError: if the number at ``inoffset`` is negative or
            larger than ``maxint`` and so fits no layout.
        """

        _bitsize = self._bitsize
        _num = self._num
        _bits = self._bits

        for key in xrange(self._numsize):
            widths = _bits[key]
            value = key << _bitsize
            # The final word of a list may be only partly filled
            num = _num[key] if _num[key] < n else n
            shift = 0

            j = 0
            # Reject negatives as well as oversized values: OR-ing a negative
            # number into the word would corrupt it
            while j < num and 0 <= inarray[inoffset + j] < (1 << widths[j]):
                value |= inarray[inoffset + j] << shift
                shift += widths[j]
                j += 1

            if j == num:
                return value, num

        # Even the single-number layout was refused, so the number at
        # inoffset itself is out of range
        raise ValueError("%r cannot be stored by Simple16 (must be between "
                         "0 and %d)" % (inarray[inoffset], self.maxint))

    def read_nums(self, f, n):
        """Yield ``n`` numbers, reading 4-byte words from ``f`` as needed."""

        _decompress = self._decompress

        i = 0
        while i < n:
            value = unpack_uint_le(f.read(4))[0]
            for v in _decompress(value, n - i):
                yield v
                i += 1

    def _decompress(self, value, n):
        """Yield the numbers packed in the word ``value``, stopping after
        ``n`` of them if the word's layout has room for more.
        """

        widths = self._bits[value >> self._bitsize]
        key = value >> self._bitsize
        num = self._num[key] if self._num[key] < n else n

        shift = 0
        for j in xrange(num):
            yield (value >> shift) & ((1 << widths[j]) - 1)
            shift += widths[j]

    def get(self, f, pos, i):
        """Return item ``i`` of the list starting at file position ``pos``.

        Words are read one after another; each word's key tells how many
        numbers it holds, so whole words are skipped without unpacking until
        the one containing item ``i`` is reached.
        """

        f.seek(pos)
        base = 0
        # unpack_uint_le returns a sequence; the word itself is element 0
        value = unpack_uint_le(f.read(4))[0]
        key = value >> self._bitsize
        num = self._num[key]
        # The current word covers items base .. base + num - 1, so move on
        # when i is at or past base + num
        while i >= base + num:
            base += num
            value = unpack_uint_le(f.read(4))[0]
            key = value >> self._bitsize
            num = self._num[key]

        offset = i - base
        if offset:
            # Discard the bits of the numbers stored before the wanted one
            value = value >> sum(self._bits[key][:offset])
        return value & (2 ** self._bits[key][offset] - 1)
# Google Packed Ints algorithm: a set of four numbers is preceded by a "key"
# byte, which encodes how many bytes each of the next four integers use
# (stored in the byte as four 2-bit numbers)
class GInts(NumberEncoding):
    """Group encoding: numbers are written four at a time behind a key byte.

    The key byte holds four 2-bit codes, one per number, lowest bits first.
    A code of 0, 1, 2 or 3 means the corresponding number was written in
    1, 2, 3 or 4 bytes respectively. A final group with fewer than four
    numbers is written with just the bytes for the numbers it has.
    """

    maxint = 2 ** 32 - 1

    # Number of future bytes to expect after a "key" byte value of N -- used to
    # skip ahead from a key byte
    _lens = array("B", [4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 5, 6,
    7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9,
    10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
    12, 13, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 6, 7, 8, 9, 7,
    8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10,
    11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11,
    12, 13, 14, 6, 7, 8, 9, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 7, 8, 9,
    10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 8, 9, 10, 11, 9, 10, 11,
    12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12,
    13, 14, 12, 13, 14, 15, 7, 8, 9, 10, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11,
    12, 13, 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 9, 10,
    11, 12, 10, 11, 12, 13, 11, 12, 13, 14, 12, 13, 14, 15, 10, 11, 12, 13, 11,
    12, 13, 14, 12, 13, 14, 15, 13, 14, 15, 16])

    def key_to_sizes(self, key):
        """Returns a list of the sizes of the next four numbers given a key
        byte.
        """

        return [(key >> (i * 2) & 3) + 1 for i in xrange(4)]

    def write_nums(self, f, numbers):
        """Write ``numbers`` to ``f`` in groups of four, each group preceded
        by its key byte.
        """

        buf = emptybytes
        count = 0
        key = 0
        for v in numbers:
            # Position of this number's 2-bit size code inside the key byte
            shift = count * 2
            if v < 256:
                # Size code 0: nothing to set in the key
                buf += pack_byte(v)
            elif v < 65536:
                key |= 1 << shift
                buf += pack_ushort_le(v)
            elif v < 16777216:
                key |= 2 << shift
                # Keep the first three of the four packed bytes
                buf += pack_uint_le(v)[:3]
            else:
                key |= 3 << shift
                buf += pack_uint_le(v)

            count += 1
            if count == 4:
                f.write_byte(key)
                f.write(buf)
                count = 0
                key = 0
                buf = emptybytes  # Clear the buffer

        # Write out leftovers in the buffer
        if count:
            f.write_byte(key)
            f.write(buf)

    def read_nums(self, f, n):
        """Read N integers from the bytes stream dbfile. Expects that the file
        is positioned at a key byte.
        """

        count = 0
        key = None
        for _ in xrange(n):
            if count == 0:
                # Start of a new group: fetch its key byte
                key = f.read_byte()
            code = key >> (count * 2) & 3
            if code == 0:
                yield f.read_byte()
            elif code == 1:
                yield f.read_ushort_le()
            elif code == 2:
                # Restore the fourth byte dropped by write_nums. The padding
                # must be a bytes literal: f.read() returns bytes, and adding
                # a text string to bytes raises TypeError on Python 3.
                yield unpack_uint_le(f.read(3) + b"\x00")[0]
            else:
                yield f.read_uint_le()

            count = (count + 1) % 4

    # Disabled skip-ahead implementation; while it stays commented out,
    # lookups go through the get() inherited from NumberEncoding.
    #
    # def get(self, f, pos, i):
    #     f.seek(pos)
    #     base = 0
    #     key = f.read_byte()
    #     while i > base + 4:
    #         base += 4
    #         f.seek(self._lens[key], 1)
    #         key = f.read_byte()
    #
    #     for n in self.read_nums(f, (i + 1) - base):
    #         pass
    #     return n