Comparación del tiempo de lectura de RocksDB, Speedb y LMDB
Preparación
Utilizaré el conjunto de datos y la base de datos de mi artículo anterior Comparación de almacenamiento de clave-valor embebido para python
python
import os
import json
import random
import string
from datetime import datetime
# Directories where the generated dataset will live.
workspace_dir = '/tmp/data'
output_dir = os.path.join(workspace_dir, 'jsons')
os.makedirs(output_dir, exist_ok=True)
def generate_random_string(length):
    """Return a random string of *length* characters drawn from
    letters, digits, punctuation and the space character."""
    alphabet = string.ascii_letters + string.digits + string.punctuation + ' '
    return ''.join(random.choice(alphabet) for _ in range(length))
# Materialize 1,000,000 JSON documents, one file per record, each with a
# random text payload of 0-2000 characters and a creation timestamp.
for doc_id in range(1000000):
    record = {
        'text': generate_random_string(random.randint(0, 2000)),
        'created_at': int(datetime.now().timestamp()),
    }
    path = os.path.join(output_dir, f'{doc_id}.json')
    with open(path, 'w') as fh:
        json.dump(record, fh)
Iterador para archivos:
python
import os
import json
def json_file_reader(directory):
    """Yield ``(id, raw_json)`` pairs for every ``<int>.json`` file in
    *directory*, in ascending numeric order of the file-name stem."""
    stem = lambda name: int(os.path.splitext(name)[0])
    candidates = (n for n in os.listdir(directory) if n.endswith('.json'))
    for name in sorted(candidates, key=stem):
        path = os.path.join(directory, name)
        # Skip anything that is not a regular file (e.g. a directory named *.json).
        if os.path.isfile(path):
            with open(path, 'r', encoding='utf-8') as fh:
                yield stem(name), fh.read()
Medición decorador:
python
import time
from functools import wraps
def measure(attempts=1000):
    """Decorator factory: benchmark the wrapped function over *attempts* calls.

    Prints the minimum, maximum and average execution time in milliseconds
    and returns the result of the last call.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            execution_times = []
            for _ in range(attempts):
                # perf_counter() is monotonic and high-resolution, unlike
                # time.time(), so it is the correct clock for benchmarking.
                start_time = time.perf_counter()
                result = func(*args, **kwargs)  # Call the original function
                end_time = time.perf_counter()
                # Convert seconds to milliseconds before aggregating.
                execution_times.append((end_time - start_time) * 1000)
            min_time = min(execution_times)
            max_time = max(execution_times)
            avg_time = sum(execution_times) / attempts
            print(f"Tiempo de ejecución de {func.__name__} sobre {attempts} intentos:")
            print(f"Mínimo: {min_time:.2f} ms")
            print(f"Máximo: {max_time:.2f} ms")
            print(f"Promedio: {avg_time:.2f} ms")
            return result
        return wrapper
    return decorator
Lmdb
python
import lmdb
# creando la base de datos
lmdb_dir = workspace_dir + '/lmdb'
# Reserve a 100 GB memory map so the environment never hits its size limit.
with lmdb.open(lmdb_dir, 10 ** 11) as env, env.begin(write=True) as txn:
    # Keys are 4-byte big-endian document ids; values are the raw JSON bytes.
    for key, payload in json_file_reader(output_dir):
        txn.put(key.to_bytes(4, 'big'), payload.encode())
# lectura
@measure(attempts=1000)
def read_lmdb():
    """Open the LMDB environment, fetch key 123456 and decode the value."""
    with lmdb.open(lmdb_dir) as env, env.begin() as txn:
        key = (123456).to_bytes(4, 'big')
        return txn.get(key).decode()
read_lmdb()
RocksDb
python
from rocksdict import Rdict
# creando la base de datos
rocksdict_dir = workspace_dir + '/rocksdb'
# Load every JSON document; rocksdict serializes int keys / str values itself.
with Rdict(rocksdict_dir) as db:
    for key, payload in json_file_reader(output_dir):
        db[key] = payload
# lectura
@measure(attempts=1000)
def read_rockdb():
    """Open the RocksDB store and read the value stored under key 123456."""
    with Rdict(rocksdict_dir) as db:
        return db[123456]
read_rockdb()
Vamos a probar un poco con diferentes parámetros:
RocksDb en modo RAW:
python
from rocksdict import Rdict, Options
# creando la base de datos
rocksdict_raw_dir = workspace_dir + '/rocksdb_raw'
# Raw mode bypasses rocksdict's serialization: keys and values must be bytes.
with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
    for key, payload in json_file_reader(output_dir):
        db[key.to_bytes(4, 'big')] = payload.encode()
# prueba
@measure(attempts=1000)
def read_rockdb_raw():
    """Read key 123456 from the raw-mode RocksDB store (bytes in, bytes out)."""
    with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
        return db[(123456).to_bytes(4, 'big')].decode()
read_rockdb_raw()
RocksDb con compresión:
python
from rocksdict import Rdict, Options, DBCompressionType
# creando la base de datos
def db_options():
    """Build RocksDB options with zstd compression enabled."""
    options = Options()
    options.set_compression_type(DBCompressionType.zstd())
    return options
rocksdict_zstd_dir = workspace_dir + '/rocksdb_zstd'
# Same dataset, but stored with zstd compression enabled.
with Rdict(rocksdict_zstd_dir, options=db_options()) as db:
    for key, payload in json_file_reader(output_dir):
        db[key] = payload
# prueba
@measure(attempts=1000)
def read_rockdb_zstd():
    """Read key 123456 from the zstd-compressed RocksDB store."""
    with Rdict(rocksdict_zstd_dir) as db:
        return db[123456]
read_rockdb_zstd()
RocksDb en modo de solo lectura:
python
from rocksdict import Rdict, AccessType
@measure(attempts=1000)
def read_rockdb():
    """Read key 123456 with the database opened in read-only mode."""
    with Rdict(rocksdict_dir, access_type=AccessType.read_only()) as db:
        return db[123456]
read_rockdb()
rocksdict_dir_secondary = workspace_dir + '/rocksdb_secondary'
@measure(attempts=1000)
def read_rockdb():
    """Read key 123456 through a secondary instance of the primary database."""
    with Rdict(rocksdict_dir, access_type=AccessType.secondary(rocksdict_dir_secondary)) as db:
        return db[123456]
read_rockdb()
Speedb
Para probar Speedb solo necesitamos cambiar la importación de rocksdict a speedict.
Resultado
nombre | espacio ocupado | tiempo de escritura | lectura mín | lectura prom | lectura máx |
---|---|---|---|---|---|
lmdb | 1.1G | 1m 26s | 0.17ms | 0.31ms | 3.68ms |
rocksdb | 1.0G | 1m 29s | 2.85ms | 6.32ms | 30.85ms |
rocksdb (raw) | 1.0G | 1m 36s | 4.04ms | 10.03ms | 37.07ms |
rocksdb (zstd) | 878M | 1m 32s | 3.52ms | 6.85ms | 18.18ms |
rocksdb (solo lectura) | 1.0G | - | 2.41ms | 3.65ms | 10.68ms |
rocksdb (secundario) | 1.0G | - | 4.71ms | 5.47ms | 31.92ms |
speedb | 1.1G | 1m 31s | 4.23ms | 10.44ms | 101.32ms |
speedb (raw) | 1.1G | 1m 36s | 4.45ms | 15.70ms | 55.91ms |
speedb (zstd) | 876M | 1m 36s | 4.22ms | 13.49ms | 87.59ms |
speedb (solo lectura) | 1.1G | - | 3.95ms | 5.19ms | 16.68ms |
speedb (secundario) | 1.1G | - | - | - | - |
- Speedb no soporta modo secundario
- Estoy sorprendido de que speedb tenga una latencia peor que rocksdb, especialmente en la columna máxima
- Es extraño que rocksdb en modo raw funcione con peor latencia
Resumen
Lmdb es mucho más rápido según mis resultados: ¡más de 10x!