Source code for scap.cdblib

# -*- coding: utf-8 -*-
"""
| Imported from: `python-pure-cdb <https://python-pure-cdb.googlecode.com/>`_
| Author: David Wilson
| License: MIT

Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables
that efficiently handle many keys, while remaining space-efficient.

    http://cr.yp.to/cdb.html

When generated databases are only used with Python code, consider using hash()
rather than djb_hash() for a tidy speedup.

.. note::
    Minor alterations made to comply with PEP8 style check and to remove
    attempt to import C implementation of djb_hash. -- bd808, 2014-03-04
"""
from itertools import chain
from _struct import Struct

import sys


if sys.version_info.major == 2:

    def py_djb_hash(s):
        """
        Return the value of DJB's hash function for the given 8-bit string.

        >>> py_djb_hash('')
        5381
        >>> py_djb_hash('\x01')
        177572
        >>> py_djb_hash('€')
        193278953
        """
        h = 5381
        for c in s:
            h = (((h << 5) + h) ^ ord(c)) & 0xFFFFFFFF
        return h

else:

[docs] def py_djb_hash(s): """ Return the value of DJB's hash function for the given 8-bit string. >>> py_djb_hash('') 5381 >>> py_djb_hash('\x01') 177572 >>> py_djb_hash('€') 193278953 """ h = 5381 for c in s.encode("UTF-8"): h = (((h << 5) + h) ^ c) & 0xFFFFFFFF return h
# 2014-03-04 bd808: removed try block for importing C hash implementation DJB_HASH = py_djb_hash READ_2_LE4 = Struct("<LL").unpack WRITE_2_LE4 = Struct("<LL").pack
[docs]class Reader(object): """ A dictionary-like object for reading a Constant Database. Reader accesses through a string or string-like sequence such as mmap.mmap(). """
[docs] def __init__(self, data, hashfn=DJB_HASH): """ Create an instance reading from a sequence and hash keys using hashfn. >>> Reader(data=b'') Traceback (most recent call last): ... OSError: CDB too small >>> Reader(data=b'a' * 2048) #doctest: +ELLIPSIS <scap.cdblib.Reader object at 0x...> """ if len(data) < 2048: raise OSError("CDB too small") self.data = data self.hashfn = hashfn self.index = [READ_2_LE4(data[i : i + 8]) for i in range(0, 2048, 8)] self.table_start = min(p[0] for p in self.index) # Assume load factor is 0.5 like official CDB. self.length = sum(p[1] >> 1 for p in self.index)
[docs] def iteritems(self): """Like dict.iteritems(). Items are returned in insertion order.""" pos = 2048 while pos < self.table_start: klen, dlen = READ_2_LE4(self.data[pos : pos + 8]) pos += 8 key = self.data[pos : pos + klen] pos += klen data = self.data[pos : pos + dlen] pos += dlen key = key.decode("UTF-8") data = data.decode("UTF-8") yield key, data
[docs] def items(self): """Like dict.items().""" return list(self.iteritems())
[docs]class Writer(object): """Object for building new Constant Databases, and writing them to a seekable file-like object."""
[docs] def __init__(self, fp, hashfn=DJB_HASH): """ Create an instance writing to a file-like object and hash keys. It uses hashfn to hash keys. >>> import tempfile >>> temp_fp = tempfile.TemporaryFile() >>> Writer(fp=temp_fp, hashfn=py_djb_hash) #doctest: +ELLIPSIS <scap.cdblib.Writer object at 0x...> """ self.fp = fp self.hashfn = hashfn fp.write(b"\x00" * 2048) self._unordered = [[] for i in range(256)]
[docs] def put(self, key, value=""): """Write a string key/value pair to the output file.""" assert isinstance(key, str) and isinstance(value, str) # Compute the hash of the key while it is still a string h = self.hashfn(key) & 0xFFFFFFFF # Convert key/value strings to bytes key = key.encode("UTF-8") value = value.encode("UTF-8") pos = self.fp.tell() self.fp.write(WRITE_2_LE4(len(key), len(value))) self.fp.write(key) self.fp.write(value) self._unordered[h & 0xFF].append((h, pos))
[docs] def finalize(self): """Write the final hash tables to the output file, and write out its index. The output file remains open upon return.""" index = [] for tbl in self._unordered: length = len(tbl) << 1 ordered = [(0, 0)] * length for pair in tbl: where = (pair[0] >> 8) % length for i in chain(range(where, length), range(0, where)): if not ordered[i][0]: ordered[i] = pair break index.append((self.fp.tell(), length)) for pair in ordered: self.fp.write(WRITE_2_LE4(*pair)) self.fp.seek(0) for pair in index: self.fp.write(WRITE_2_LE4(*pair)) self.fp = None # prevent double finalize()