Comparing the read times of RocksDb, Speedb and Lmdb

Preparation

I'm going to use the dataset and the database from my previous article, Comparison of embedded key-value stores for Python.

python
import os
import json
import random
import string
from datetime import datetime

workspace_dir = '/tmp/data'
output_dir = f'{workspace_dir}/jsons'
os.makedirs(output_dir, exist_ok=True)

def generate_random_string(length):
    return ''.join(random.choices(string.ascii_letters + string.digits + string.punctuation + ' ', k=length))

for i in range(1000000):
    data = {
        'text': generate_random_string(random.randint(0, 2000)),
        'created_at': int(datetime.now().timestamp())
    }
    filename = os.path.join(output_dir, f'{i}.json')
    with open(filename, 'w') as json_file:
        json.dump(data, json_file)

An iterator over the files:

python
import os
import json

def json_file_reader(directory):
    json_files = [filename for filename in os.listdir(directory) if filename.endswith('.json')]
    sorted_files = sorted(json_files, key=lambda x: int(os.path.splitext(x)[0]))
    for filename in sorted_files:
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as json_file:
                yield int(os.path.splitext(filename)[0]), json_file.read()

A measurement decorator:

python
import time
from functools import wraps

def measure(attempts=1000):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            execution_times = []
            for _ in range(attempts):
                start_time = time.time()  # record the start time
                result = func(*args, **kwargs)  # call the original function
                end_time = time.time()  # record the end time
                execution_time = (end_time - start_time) * 1000  # convert to milliseconds
                execution_times.append(execution_time)

            min_time = min(execution_times)
            max_time = max(execution_times)
            avg_time = sum(execution_times) / attempts

            print(f"Temps d'exécution de {func.__name__} sur {attempts} tentatives:")
            print(f"Min: {min_time:.2f} ms")
            print(f"Max: {max_time:.2f} ms")
            print(f"Moyenne: {avg_time:.2f} ms")

            return result
        return wrapper
    return decorator

Lmdb

python
import lmdb

# create the database
lmdb_dir = f'{workspace_dir}/lmdb'
# reserve a 100 GB map size
with lmdb.open(lmdb_dir, 10 ** 11) as env:
    with env.begin(write=True) as txn:
        for i, data in json_file_reader(output_dir):
            txn.put(i.to_bytes(4, 'big'), data.encode())

# read
@measure(attempts=1000)
def read_lmdb():
    with lmdb.open(lmdb_dir) as env:
        with env.begin() as txn:
            return txn.get(int(123456).to_bytes(4, 'big')).decode()

read_lmdb()

RocksDb

python
from rocksdict import Rdict

# create the database
rocksdict_dir = f'{workspace_dir}/rocksdb'
with Rdict(rocksdict_dir) as db:
    for i, data in json_file_reader(output_dir):
        db[i] = data

# read
@measure(attempts=1000)
def read_rockdb():
    with Rdict(rocksdict_dir) as db:
        return db[123456]

read_rockdb()

Let's play around a bit with different options:

RocksDb in raw mode:

python
from rocksdict import Rdict, Options

# create the database
rocksdict_raw_dir = f'{workspace_dir}/rocksdb_raw'
with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
    for i, data in json_file_reader(output_dir):
        db[i.to_bytes(4, 'big')] = data.encode()

# test
@measure(attempts=1000)
def read_rockdb_raw():
    with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
        return db[int(123456).to_bytes(4, 'big')].decode()

read_rockdb_raw()

RocksDb with compression:

python
from rocksdict import Rdict, Options, DBCompressionType


# create the database
def db_options():
    opt = Options()
    opt.set_compression_type(DBCompressionType.zstd())
    return opt


rocksdict_zstd_dir = f'{workspace_dir}/rocksdb_zstd'
with Rdict(rocksdict_zstd_dir, options=db_options()) as db:
    for i, data in json_file_reader(output_dir):
        db[i] = data


# test
@measure(attempts=1000)
def read_rockdb_zstd():
    with Rdict(rocksdict_zstd_dir) as db:
        return db[123456]

read_rockdb_zstd()

RocksDb in read-only mode, then via a secondary instance:

python
from rocksdict import Rdict, AccessType


@measure(attempts=1000)
def read_rockdb():
    with Rdict(rocksdict_dir, access_type=AccessType.read_only()) as db:
        return db[123456]

read_rockdb()

rocksdict_dir_secondary = f'{workspace_dir}/rocksdb_secondary'

@measure(attempts=1000)
def read_rockdb():
    with Rdict(rocksdict_dir, access_type=AccessType.secondary(rocksdict_dir_secondary)) as db:
        return db[123456]

read_rockdb()

Speedb

To test Speedb, all you need to do is change the import from rocksdict to speedict.
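
For example, here is a minimal sketch of the default Speedb run, assuming speedict exposes the same Rdict interface as rocksdict (the directory name speedict_dir and the function name read_speedb are my own choices):

python
from speedict import Rdict

# create the database (same pattern as the default rocksdict run above)
speedict_dir = f'{workspace_dir}/speedb'
with Rdict(speedict_dir) as db:
    for i, data in json_file_reader(output_dir):
        db[i] = data

# read
@measure(attempts=1000)
def read_speedb():
    with Rdict(speedict_dir) as db:
        return db[123456]

read_speedb()

The raw, zstd and read-only variants follow the same way: swap the import in the corresponding snippets above.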

Results

name                | space used | write time | min read | avg read | max read
lmdb                | 1.1G       | 1m 26s     | 0.17ms   | 0.31ms   | 3.68ms
rocksdb             | 1.0G       | 1m 29s     | 2.85ms   | 6.32ms   | 30.85ms
rocksdb (raw)       | 1.0G       | 1m 36s     | 4.04ms   | 10.03ms  | 37.07ms
rocksdb (zstd)      | 878M       | 1m 32s     | 3.52ms   | 6.85ms   | 18.18ms
rocksdb (read-only) | 1.0G       | -          | 2.41ms   | 3.65ms   | 10.68ms
rocksdb (secondary) | 1.0G       | -          | 4.71ms   | 5.47ms   | 31.92ms
speedb              | 1.1G       | 1m 31s     | 4.23ms   | 10.44ms  | 101.32ms
speedb (raw)        | 1.1G       | 1m 36s     | 4.45ms   | 15.70ms  | 55.91ms
speedb (zstd)       | 876M       | 1m 36s     | 4.22ms   | 13.49ms  | 87.59ms
speedb (read-only)  | 1.1G       | -          | 3.95ms   | 5.19ms   | 16.68ms
speedb (secondary)  | 1.1G       | -          | -        | -        | -
  • Speedb does not support secondary mode
  • I'm surprised that Speedb shows worse latency than RocksDb, especially in the max column
  • It's odd that RocksDb in raw mode runs with worse latency

Summary

According to my results, Lmdb is much faster: more than 10x! Its average read of 0.31ms is roughly 20x faster than the default RocksDb setup (6.32ms), and still more than 10x faster than the best RocksDb configuration, read-only mode (3.65ms).