Skip to content

Comparison of read times for RocksDB, Speedb, and LMDB

Preparation

I will use the dataset and databases from my previous article, Comparison embedded key-value storages for python

python
import os
import json
import random
import string
from datetime import datetime

# Benchmark workspace layout: all generated JSON documents live under
# /tmp/data/jsons. Creating the directory up front is idempotent.
workspace_dir = '/tmp/data'
output_dir = os.path.join(workspace_dir, 'jsons')
os.makedirs(output_dir, exist_ok=True)

def generate_random_string(length):
    """Return a random string of `length` characters drawn from ASCII
    letters, digits, punctuation, and the space character."""
    alphabet = string.ascii_letters + string.digits + string.punctuation + ' '
    return ''.join(random.choices(alphabet, k=length))

# Generate 1,000,000 JSON documents: each has a random text payload
# (0-2000 chars) and a unix-epoch creation timestamp.
for doc_id in range(1_000_000):
    record = {
        'text': generate_random_string(random.randint(0, 2000)),
        'created_at': int(datetime.now().timestamp()),
    }
    target = os.path.join(output_dir, f'{doc_id}.json')
    with open(target, 'w') as fh:
        json.dump(record, fh)

Iterator for files:

python
import os
import json

def json_file_reader(directory):
    """Yield (numeric id, raw file contents) pairs for every `*.json`
    file in `directory`, ordered by the integer value of the filename stem."""
    candidates = (name for name in os.listdir(directory) if name.endswith('.json'))
    for name in sorted(candidates, key=lambda n: int(os.path.splitext(n)[0])):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            # skip directories that happen to match the *.json pattern
            continue
        with open(path, 'r', encoding='utf-8') as fh:
            yield int(os.path.splitext(name)[0]), fh.read()

Measuring decorator:

python
import time
from functools import wraps

def measure(attempts=1000):
    """Decorator factory: run the wrapped function `attempts` times and
    print the min/max/avg wall-clock duration in milliseconds.

    The decorated function still returns a value (the result of the last
    invocation), so it can be used both for benchmarking and for its output.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            execution_times = []
            for _ in range(attempts):
                # perf_counter() is monotonic and high-resolution, unlike
                # time.time(), so it is the correct clock for benchmarking.
                start_time = time.perf_counter()
                result = func(*args, **kwargs)
                elapsed_ms = (time.perf_counter() - start_time) * 1000
                execution_times.append(elapsed_ms)

            min_time = min(execution_times)
            max_time = max(execution_times)
            avg_time = sum(execution_times) / attempts

            print(f"Execution time of {func.__name__} over {attempts} attempts:")
            print(f"Min: {min_time:.2f} ms")
            print(f"Max: {max_time:.2f} ms")
            print(f"Avg: {avg_time:.2f} ms")

            return result
        return wrapper
    return decorator

Lmdb

python
import lmdb

# creating db
lmdb_dir = f'{workspace_dir}/lmdb'
# let's reserve 100Gb
with lmdb.open(lmdb_dir, 10 ** 11) as env:
    with env.begin(write=True) as txn:
        for i, data in json_file_reader(output_dir):
            txn.put(i.to_bytes(4, 'big'), data.encode())

# reading
@measure(attempts=1000)
def read_lmdb():
    with lmdb.open(lmdb_dir) as env:
        with env.begin() as txn:
            return txn.get(int(123456).to_bytes(4, 'big')).decode()

read_lmdb()

RocksDb

python
from rocksdict import Rdict

# creating db
rocksdict_dir = f'{workspace_dir}/rocksdb'
with Rdict(rocksdict_dir) as db:
    for key, payload in json_file_reader(output_dir):
        db[key] = payload

# reading
@measure(attempts=1000)
def read_rockdb():
    """Open the database and fetch a single record by integer key."""
    store = Rdict(rocksdict_dir)
    with store:
        return store[123456]

read_rockdb()

Let's try to play a bit with different params:

RocksDb in RAW mode:

python
from rocksdict import Rdict, Options

# creating db (raw mode: keys and values are plain bytes, no pickling)
rocksdict_raw_dir = f'{workspace_dir}/rocksdb_raw'
with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
    for key, payload in json_file_reader(output_dir):
        db[key.to_bytes(4, 'big')] = payload.encode()

# test
@measure(attempts=1000)
def read_rockdb_raw():
    """Fetch one record through the raw (bytes-only) interface."""
    with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
        raw = db[(123456).to_bytes(4, 'big')]
        return raw.decode()

read_rockdb_raw()

RocksDb with compression:

python
from rocksdict import Rdict, Options, DBCompressionType


# creating db
def db_options():
    """Build an Options object with zstd compression enabled."""
    options = Options()
    options.set_compression_type(DBCompressionType.zstd())
    return options


rocksdict_zstd_dir = f'{workspace_dir}/rocksdb_zstd'
with Rdict(rocksdict_zstd_dir, options=db_options()) as db:
    for key, payload in json_file_reader(output_dir):
        db[key] = payload


# test
@measure(attempts=1000)
def read_rockdb_zstd():
    """Fetch one record from the zstd-compressed database."""
    store = Rdict(rocksdict_zstd_dir)
    with store:
        return store[123456]
read_rockdb_zstd()

RocksDb in readonly mode:

python
from rocksdict import Rdict, AccessType


@measure(attempts=1000)
def read_rockdb():
    """Fetch one record with the database opened in read-only mode."""
    store = Rdict(rocksdict_dir, access_type=AccessType.read_only())
    with store:
        return store[123456]
read_rockdb()

rocksdict_dir_secondary = f'{workspace_dir}/rocksdb_secondary'

@measure(attempts=1000)
def read_rockdb():
    """Fetch one record with the database opened as a secondary instance."""
    store = Rdict(rocksdict_dir, access_type=AccessType.secondary(rocksdict_dir_secondary))
    with store:
        return store[123456]
read_rockdb()

Speedb

To test Speedb, we just need to change the import from rocksdict to speedict.

Result

| name | occupied space | writing time | read min | read avg | read max |
|---|---|---|---|---|---|
| lmdb | 1.1G | 1m 26s | 0.17ms | 0.31ms | 3.68ms |
| rocksdb | 1.0G | 1m 29s | 2.85ms | 6.32ms | 30.85ms |
| rocksdb (raw) | 1.0G | 1m 36s | 4.04ms | 10.03ms | 37.07ms |
| rocksdb (zstd) | 878M | 1m 32s | 3.52ms | 6.85ms | 18.18ms |
| rocksdb (read only) | 1.0G | - | 2.41ms | 3.65ms | 10.68ms |
| rocksdb (secondary) | 1.0G | - | 4.71ms | 5.47ms | 31.92ms |
| speedb | 1.1G | 1m 31s | 4.23ms | 10.44ms | 101.32ms |
| speedb (raw) | 1.1G | 1m 36s | 4.45ms | 15.70ms | 55.91ms |
| speedb (zstd) | 876M | 1m 36s | 4.22ms | 13.49ms | 87.59ms |
| speedb (read only) | 1.1G | - | 3.95ms | 5.19ms | 16.68ms |
| speedb (secondary) | 1.1G | - | - | - | - |
  • Speedb doesn't support secondary mode
  • I am surprised that Speedb has worse latency than RocksDB, especially in the max column
  • It is strange that RocksDB in raw mode has worse latency

Summary

LMDB is much faster according to my results: more than 10x!