Skip to content

Comparação do tempo de leitura de RocksDb, Speedb, Lmdb

Preparação

Vou usar o conjunto de dados e o banco de dados do meu artigo anterior, "Comparação de armazenamentos incorporados de chave-valor para Python".

python
import os
import json
import random
import string
from datetime import datetime

# All artefacts live under one scratch directory.
workspace_dir = '/tmp/data'
# 1,000,000 small JSON documents are written here by the generation loop below.
output_dir = f'{workspace_dir}/jsons'
os.makedirs(output_dir, exist_ok=True)

def generate_random_string(length):
    """Return a random string of *length* characters drawn (with repetition)
    from ASCII letters, digits, punctuation and the space character."""
    alphabet = string.ascii_letters + string.digits + string.punctuation + ' '
    return ''.join(random.choices(alphabet, k=length))

# Generate 1,000,000 JSON files named '<i>.json', each holding a random text
# of 0-2000 characters plus a creation timestamp (seconds since the epoch).
for idx in range(1000000):
    record = {
        'text': generate_random_string(random.randint(0, 2000)),
        'created_at': int(datetime.now().timestamp())
    }
    target = os.path.join(output_dir, f'{idx}.json')
    with open(target, 'w') as out_file:
        json.dump(record, out_file)

Iterador para arquivos:

python
import os
import json

def json_file_reader(directory):
    """Yield ``(numeric_id, raw_json)`` pairs for every ``<int>.json`` file in
    *directory*, in ascending numeric order of the filename stem.

    The numeric key is parsed once per file (the original parsed it twice:
    once for sorting, once again when yielding).
    """
    keyed = []
    for filename in os.listdir(directory):
        stem, ext = os.path.splitext(filename)
        if ext == '.json':
            keyed.append((int(stem), filename))
    for key, filename in sorted(keyed):
        file_path = os.path.join(directory, filename)
        # Guard against entries that are not regular files (e.g. a directory
        # named '123.json'); matches the original's os.path.isfile check.
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as json_file:
                yield key, json_file.read()

Decorador de medição:

python
import time
from functools import wraps

def measure(attempts=1000):
    """Decorator factory: run the wrapped function *attempts* times and print
    min/max/average wall time in milliseconds.

    Returns the result of the last call so the decorated function stays usable
    as a plain function. Uses ``time.perf_counter`` instead of ``time.time``:
    perf_counter is monotonic and high-resolution, while time.time can jump on
    system clock adjustments and has coarser granularity.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            execution_times = []
            for _ in range(attempts):
                start_time = time.perf_counter()
                result = func(*args, **kwargs)
                end_time = time.perf_counter()
                # Convert seconds to milliseconds for reporting.
                execution_times.append((end_time - start_time) * 1000)

            min_time = min(execution_times)
            max_time = max(execution_times)
            avg_time = sum(execution_times) / attempts

            print(f"Tempo de execução de {func.__name__} em {attempts} tentativas:")
            print(f"Mín: {min_time:.2f} ms")
            print(f"Máx: {max_time:.2f} ms")
            print(f"Média: {avg_time:.2f} ms")

            return result
        return wrapper
    return decorator

Lmdb

python
import lmdb

# create the database
lmdb_dir = f'{workspace_dir}/lmdb'
# reserve 100 GB (presumably LMDB's map_size positional argument — confirm against py-lmdb docs)
with lmdb.open(lmdb_dir, 10 ** 11) as env:
    with env.begin(write=True) as txn:
        for i, data in json_file_reader(output_dir):
            # 4-byte big-endian keys make byte order match numeric order
            txn.put(i.to_bytes(4, 'big'), data.encode())

# read benchmark
@measure(attempts=1000)
def read_lmdb():
    """Open the LMDB environment, fetch key 123456 and return it as str."""
    env = lmdb.open(lmdb_dir)
    with env, env.begin() as txn:
        raw = txn.get((123456).to_bytes(4, 'big'))
        return raw.decode()

read_lmdb()

RocksDb

python
from rocksdict import Rdict

# create the database
rocksdict_dir = f'{workspace_dir}/rocksdb'
with Rdict(rocksdict_dir) as db:
    # default (non-raw) mode: int keys and str values stored as-is;
    # contrast with the raw_mode variant below, which stores bytes directly
    for i, data in json_file_reader(output_dir):
        db[i] = data

# read benchmark
@measure(attempts=1000)
def read_rockdb():
    """Open the RocksDB store and return the value stored under key 123456."""
    db = Rdict(rocksdict_dir)
    with db:
        return db[123456]

read_rockdb()

Vamos tentar jogar um pouco com diferentes parâmetros:

RocksDb no modo RAW:

python
from rocksdict import Rdict, Options

# create the database
rocksdict_raw_dir = f'{workspace_dir}/rocksdb_raw'
# raw mode: keys and values are passed as bytes, same encoding as the LMDB run
with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
    for i, data in json_file_reader(output_dir):
        db[i.to_bytes(4, 'big')] = data.encode()

# read benchmark (raw mode: bytes key in, bytes value out)
@measure(attempts=1000)
def read_rockdb_raw():
    """Fetch key 123456 from the raw-mode store and decode the bytes value."""
    db = Rdict(rocksdict_raw_dir, options=Options(raw_mode=True))
    with db:
        return db[(123456).to_bytes(4, 'big')].decode()

read_rockdb_raw()

RocksDb com compressão:

python
from rocksdict import Rdict, Options, DBCompressionType


# options for creating the compressed database
def db_options():
    """Return a rocksdict Options object configured for zstd compression."""
    options = Options()
    options.set_compression_type(DBCompressionType.zstd())
    return options


# build the zstd-compressed store
rocksdict_zstd_dir = f'{workspace_dir}/rocksdb_zstd'
with Rdict(rocksdict_zstd_dir, options=db_options()) as db:
    for i, data in json_file_reader(output_dir):
        db[i] = data


# read benchmark
# NOTE(review): the store is reopened here without explicit options —
# presumably the compression settings persist on disk; confirm.
@measure(attempts=1000)
def read_rockdb_zstd():
    """Fetch key 123456 from the zstd-compressed RocksDB store."""
    db = Rdict(rocksdict_zstd_dir)
    with db:
        return db[123456]

read_rockdb_zstd()

RocksDb no modo somente leitura:

python
from rocksdict import Rdict, AccessType


@measure(attempts=1000)
def read_rockdb():
    """Fetch key 123456 with the store opened in read-only mode."""
    db = Rdict(rocksdict_dir, access_type=AccessType.read_only())
    with db:
        return db[123456]

read_rockdb()

rocksdict_dir_secondary = f'{workspace_dir}/rocksdb_secondary'

@measure(attempts=1000)
def read_rockdb():
    """Fetch key 123456 through a secondary-mode instance of the store."""
    db = Rdict(rocksdict_dir, access_type=AccessType.secondary(rocksdict_dir_secondary))
    with db:
        return db[123456]

read_rockdb()

Speedb

Para testar o Speedb, só precisamos mudar a importação de `rocksdict` para `speedict`.

Resultado

| nome | espaço ocupado | tempo de escrita | leitura mín | leitura média | leitura máx |
|---|---|---|---|---|---|
| lmdb | 1.1G | 1m 26s | 0.17ms | 0.31ms | 3.68ms |
| rocksdb | 1.0G | 1m 29s | 2.85ms | 6.32ms | 30.85ms |
| rocksdb (raw) | 1.0G | 1m 36s | 4.04ms | 10.03ms | 37.07ms |
| rocksdb (zstd) | 878M | 1m 32s | 3.52ms | 6.85ms | 18.18ms |
| rocksdb (somente leitura) | 1.0G | - | 2.41ms | 3.65ms | 10.68ms |
| rocksdb (secundário) | 1.0G | - | 4.71ms | 5.47ms | 31.92ms |
| speedb | 1.1G | 1m 31s | 4.23ms | 10.44ms | 101.32ms |
| speedb (raw) | 1.1G | 1m 36s | 4.45ms | 15.70ms | 55.91ms |
| speedb (zstd) | 876M | 1m 36s | 4.22ms | 13.49ms | 87.59ms |
| speedb (somente leitura) | 1.1G | - | 3.95ms | 5.19ms | 16.68ms |
| speedb (secundário) | 1.1G | - | - | - | - |
  • Speedb não suporta modo secundário
  • Estou surpreso que speedb tenha uma latência pior que rocksdb, especialmente na coluna máxima
  • É estranho que rocksdb no modo raw trabalhe com uma latência pior

Resumo

Lmdb é muito mais rápido de acordo com meus resultados: mais de 10x!