Skip to content

Comparación del tiempo de lectura de RocksDb, Speedb, Lmdb

Preparación

Utilizaré el conjunto de datos y la base de datos de mi artículo anterior Comparación de almacenamiento de clave-valor embebido para python

python
import os
import json
import random
import string
from datetime import datetime

# Root directory for all benchmark artifacts (databases + generated JSON files).
workspace_dir = '/tmp/data'
# The 1M generated JSON documents land here; later blocks read them back.
output_dir = f'{workspace_dir}/jsons'
os.makedirs(output_dir, exist_ok=True)

def generate_random_string(length):
    """Return a random string of `length` characters drawn from letters, digits, punctuation and space."""
    alphabet = string.ascii_letters + string.digits + string.punctuation + ' '
    picks = random.choices(alphabet, k=length)
    return ''.join(picks)

# Generate one million JSON documents named 0.json .. 999999.json, each holding
# a random text payload (0-2000 chars) and a unix creation timestamp.
for idx in range(1000000):
    record = {
        'text': generate_random_string(random.randint(0, 2000)),
        'created_at': int(datetime.now().timestamp()),
    }
    out_path = os.path.join(output_dir, f'{idx}.json')
    with open(out_path, 'w') as fh:
        json.dump(record, fh)

Iterador para archivos:

python
import os
import json

def json_file_reader(directory):
    """Yield (numeric_id, raw_json_text) for every <id>.json file in `directory`, in ascending id order."""
    candidates = (name for name in os.listdir(directory) if name.endswith('.json'))
    ordered = sorted(candidates, key=lambda name: int(os.path.splitext(name)[0]))
    for name in ordered:
        path = os.path.join(directory, name)
        # Skip anything that is not a regular file (e.g. a directory named *.json).
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as fh:
            yield int(os.path.splitext(name)[0]), fh.read()

Medición decorador:

python
import time
from functools import wraps

def measure(attempts=1000):
    """Decorator factory: time the wrapped function over `attempts` calls and print min/max/avg in ms.

    Fix: uses time.perf_counter() instead of time.time(). perf_counter() is a
    monotonic, high-resolution clock intended for benchmarking; time.time() is
    wall-clock time, which has coarser resolution and can jump (NTP adjustments),
    corrupting the measurements.

    The wrapper returns the result of the last call to the wrapped function.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            execution_times = []
            for _ in range(attempts):
                start_time = time.perf_counter()  # monotonic start mark
                result = func(*args, **kwargs)  # call the original function
                end_time = time.perf_counter()  # monotonic end mark
                execution_times.append((end_time - start_time) * 1000)  # seconds -> ms

            min_time = min(execution_times)
            max_time = max(execution_times)
            avg_time = sum(execution_times) / attempts

            print(f"Tiempo de ejecución de {func.__name__} sobre {attempts} intentos:")
            print(f"Mínimo: {min_time:.2f} ms")
            print(f"Máximo: {max_time:.2f} ms")
            print(f"Promedio: {avg_time:.2f} ms")

            return result
        return wrapper
    return decorator

Lmdb

python
import lmdb

# Create the database.
lmdb_dir = f'{workspace_dir}/lmdb'
# Reserve a 100 GB memory map up front.
with lmdb.open(lmdb_dir, 10 ** 11) as env, env.begin(write=True) as txn:
    for key, payload in json_file_reader(output_dir):
        txn.put(key.to_bytes(4, 'big'), payload.encode())

# Read benchmark: the environment is reopened on every measured call.
@measure(attempts=1000)
def read_lmdb():
    with lmdb.open(lmdb_dir) as env, env.begin() as txn:
        return txn.get((123456).to_bytes(4, 'big')).decode()

read_lmdb()

RocksDb

python
from rocksdict import Rdict

# Create the database.
rocksdict_dir = f'{workspace_dir}/rocksdb'
with Rdict(rocksdict_dir) as db:
    for key, payload in json_file_reader(output_dir):
        db[key] = payload

# Read benchmark: the store is reopened on every measured call.
@measure(attempts=1000)
def read_rockdb():
    with Rdict(rocksdict_dir) as db:
        return db[123456]

read_rockdb()

Vamos a probar un poco con diferentes parámetros:

RocksDb en modo RAW:

python
from rocksdict import Rdict, Options

# Create the database in raw mode (keys and values are plain bytes).
rocksdict_raw_dir = f'{workspace_dir}/rocksdb_raw'
with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
    for key, payload in json_file_reader(output_dir):
        db[key.to_bytes(4, 'big')] = payload.encode()

# Benchmark: same single-key read, but through the raw (bytes) interface.
@measure(attempts=1000)
def read_rockdb_raw():
    with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
        return db[(123456).to_bytes(4, 'big')].decode()

read_rockdb_raw()

RocksDb con compresión:

python
from rocksdict import Rdict, Options, DBCompressionType


# Create the database with zstd block compression enabled.
def db_options():
    """Build rocksdict Options configured for zstd compression."""
    options = Options()
    options.set_compression_type(DBCompressionType.zstd())
    return options


rocksdict_zstd_dir = f'{workspace_dir}/rocksdb_zstd'
with Rdict(rocksdict_zstd_dir, options=db_options()) as db:
    for key, payload in json_file_reader(output_dir):
        db[key] = payload


# Benchmark: read back one key from the zstd-compressed store.
@measure(attempts=1000)
def read_rockdb_zstd():
    with Rdict(rocksdict_zstd_dir) as db:
        return db[123456]
read_rockdb_zstd()

RocksDb en modo de solo lectura:

python
from rocksdict import Rdict, AccessType


# Benchmark: open the existing store with read-only access on every call.
@measure(attempts=1000)
def read_rockdb():
    with Rdict(rocksdict_dir, access_type=AccessType.read_only()) as db:
        return db[123456]
read_rockdb()

# Directory where the secondary instance keeps its own files.
rocksdict_dir_secondary = f'{workspace_dir}/rocksdb_secondary'

# Benchmark: open as a "secondary" instance pointed at the primary store.
# NOTE: this rebinds the name read_rockdb, replacing the function defined above.
@measure(attempts=1000)
def read_rockdb():
    with Rdict(rocksdict_dir, access_type=AccessType.secondary(rocksdict_dir_secondary)) as db:
        return db[123456]
read_rockdb()

Speedb

Para probar speedb solo necesitamos cambiar la importación de rocksdict a speedict

Resultado

| nombre | espacio ocupado | tiempo de escritura | lectura mín | lectura prom | lectura máx |
|---|---|---|---|---|---|
| lmdb | 1.1G | 1m 26s | 0.17ms | 0.31ms | 3.68ms |
| rocksdb | 1.0G | 1m 29s | 2.85ms | 6.32ms | 30.85ms |
| rocksdb (raw) | 1.0G | 1m 36s | 4.04ms | 10.03ms | 37.07ms |
| rocksdb (zstd) | 878M | 1m 32s | 3.52ms | 6.85ms | 18.18ms |
| rocksdb (solo lectura) | 1.0G | - | 2.41ms | 3.65ms | 10.68ms |
| rocksdb (secundario) | 1.0G | - | 4.71ms | 5.47ms | 31.92ms |
| speedb | 1.1G | 1m 31s | 4.23ms | 10.44ms | 101.32ms |
| speedb (raw) | 1.1G | 1m 36s | 4.45ms | 15.70ms | 55.91ms |
| speedb (zstd) | 876M | 1m 36s | 4.22ms | 13.49ms | 87.59ms |
| speedb (solo lectura) | 1.1G | - | 3.95ms | 5.19ms | 16.68ms |
| speedb (secundario) | 1.1G | - | - | - | - |
  • Speedb no soporta modo secundario
  • Estoy sorprendido de que speedb tenga una latencia peor que rocksdb, especialmente en la columna máxima
  • Es extraño que rocksdb en modo raw funcione con peor latencia

Resumen

Lmdb es mucho más rápido según mis resultados: ¡más de 10x!