Skip to content

Comparison of read times for RocksDB, Speedb, and LMDB

Preparation

I will use the dataset and databases from my previous article, Comparison embedded key-value storages for python

python
import os
import json
import random
import string
from datetime import datetime

# Benchmark workspace layout: all generated JSON documents live under
# /tmp/data/jsons. Creating the directory up front is idempotent.
workspace_dir = '/tmp/data'
output_dir = os.path.join(workspace_dir, 'jsons')
os.makedirs(output_dir, exist_ok=True)

def generate_random_string(length):
    """Return a random string of `length` characters drawn from ASCII
    letters, digits, punctuation, and the space character."""
    alphabet = string.ascii_letters + string.digits + string.punctuation + ' '
    return ''.join(random.choices(alphabet, k=length))

# Generate 1,000,000 JSON documents: each has a random text payload
# (0-2000 chars) and a unix-epoch creation timestamp.
for doc_id in range(1_000_000):
    record = {
        'text': generate_random_string(random.randint(0, 2000)),
        'created_at': int(datetime.now().timestamp()),
    }
    target = os.path.join(output_dir, f'{doc_id}.json')
    with open(target, 'w') as fh:
        json.dump(record, fh)

Iterator for files:

python
import os
import json

def json_file_reader(directory):
    """Yield (numeric id, raw file contents) pairs for every `*.json`
    file in `directory`, ordered by the integer value of the filename stem."""
    candidates = (name for name in os.listdir(directory) if name.endswith('.json'))
    for name in sorted(candidates, key=lambda n: int(os.path.splitext(n)[0])):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            # skip directories that happen to match the *.json pattern
            continue
        with open(path, 'r', encoding='utf-8') as fh:
            yield int(os.path.splitext(name)[0]), fh.read()

Measuring decorator:

python
import time
from functools import wraps

def measure(attempts=1000):
    """Decorator factory: run the wrapped function `attempts` times and
    print the min/max/avg wall-clock duration in milliseconds.

    The decorated function still returns a value (the result of the last
    invocation), so it can be used both for benchmarking and for its output.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            execution_times = []
            for _ in range(attempts):
                # perf_counter() is monotonic and high-resolution, unlike
                # time.time(), so it is the correct clock for benchmarking.
                start_time = time.perf_counter()
                result = func(*args, **kwargs)
                elapsed_ms = (time.perf_counter() - start_time) * 1000
                execution_times.append(elapsed_ms)

            min_time = min(execution_times)
            max_time = max(execution_times)
            avg_time = sum(execution_times) / attempts

            print(f"Execution time of {func.__name__} over {attempts} attempts:")
            print(f"Min: {min_time:.2f} ms")
            print(f"Max: {max_time:.2f} ms")
            print(f"Avg: {avg_time:.2f} ms")

            return result
        return wrapper
    return decorator

Lmdb

python
import lmdb

# creating db
lmdb_dir = f'{workspace_dir}/lmdb'
# let's reserve 100Gb
with lmdb.open(lmdb_dir, 10 ** 11) as env:
    with env.begin(write=True) as txn:
        for i, data in json_file_reader(output_dir):
            txn.put(i.to_bytes(4, 'big'), data.encode())

# reading
@measure(attempts=1000)
def read_lmdb():
    with lmdb.open(lmdb_dir) as env:
        with env.begin() as txn:
            return txn.get(int(123456).to_bytes(4, 'big')).decode()

read_lmdb()

RocksDb

python
from rocksdict import Rdict

# creating db
rocksdict_dir = f'{workspace_dir}/rocksdb'
with Rdict(rocksdict_dir) as db:
    for key, payload in json_file_reader(output_dir):
        db[key] = payload

# reading
@measure(attempts=1000)
def read_rockdb():
    """Open the database and fetch a single record by integer key."""
    store = Rdict(rocksdict_dir)
    with store:
        return store[123456]

read_rockdb()

Let's try to play a bit with different params:

RocksDb in RAW mode:

python
from rocksdict import Rdict, Options

# creating db (raw mode: keys and values are plain bytes, no pickling)
rocksdict_raw_dir = f'{workspace_dir}/rocksdb_raw'
with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
    for key, payload in json_file_reader(output_dir):
        db[key.to_bytes(4, 'big')] = payload.encode()

# test
@measure(attempts=1000)
def read_rockdb_raw():
    """Fetch one record through the raw (bytes-only) interface."""
    with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
        raw = db[(123456).to_bytes(4, 'big')]
        return raw.decode()

read_rockdb_raw()

RocksDb with compression:

python
from rocksdict import Rdict, Options, DBCompressionType


# creating db
def db_options():
    """Build an Options object with zstd compression enabled."""
    options = Options()
    options.set_compression_type(DBCompressionType.zstd())
    return options


rocksdict_zstd_dir = f'{workspace_dir}/rocksdb_zstd'
with Rdict(rocksdict_zstd_dir, options=db_options()) as db:
    for key, payload in json_file_reader(output_dir):
        db[key] = payload


# test
@measure(attempts=1000)
def read_rockdb_zstd():
    """Fetch one record from the zstd-compressed database."""
    store = Rdict(rocksdict_zstd_dir)
    with store:
        return store[123456]
read_rockdb_zstd()

RocksDb in readonly mode:

python
from rocksdict import Rdict, AccessType


@measure(attempts=1000)
def read_rockdb():
    """Fetch one record with the database opened in read-only mode."""
    store = Rdict(rocksdict_dir, access_type=AccessType.read_only())
    with store:
        return store[123456]
read_rockdb()

rocksdict_dir_secondary = f'{workspace_dir}/rocksdb_secondary'

@measure(attempts=1000)
def read_rockdb():
    """Fetch one record with the database opened as a secondary instance."""
    store = Rdict(rocksdict_dir, access_type=AccessType.secondary(rocksdict_dir_secondary))
    with store:
        return store[123456]
read_rockdb()

Speedb

To test Speedb, we just need to change the import from rocksdict to speedict.

Result

| name | occupied space | writing time | read min | read avg | read max |
|---|---|---|---|---|---|
| lmdb | 1.1G | 1m 26s | 0.17ms | 0.31ms | 3.68ms |
| rocksdb | 1.0G | 1m 29s | 2.85ms | 6.32ms | 30.85ms |
| rocksdb (raw) | 1.0G | 1m 36s | 4.04ms | 10.03ms | 37.07ms |
| rocksdb (zstd) | 878M | 1m 32s | 3.52ms | 6.85ms | 18.18ms |
| rocksdb (read only) | 1.0G | - | 2.41ms | 3.65ms | 10.68ms |
| rocksdb (secondary) | 1.0G | - | 4.71ms | 5.47ms | 31.92ms |
| speedb | 1.1G | 1m 31s | 4.23ms | 10.44ms | 101.32ms |
| speedb (raw) | 1.1G | 1m 36s | 4.45ms | 15.70ms | 55.91ms |
| speedb (zstd) | 876M | 1m 36s | 4.22ms | 13.49ms | 87.59ms |
| speedb (read only) | 1.1G | - | 3.95ms | 5.19ms | 16.68ms |
| speedb (secondary) | 1.1G | - | - | - | - |
  • Speedb doesn't support secondary mode
  • I am surprised that Speedb has worse latency than RocksDB, especially in the max column
  • It is strange that RocksDB in raw mode has worse latency

Summary

LMDB is much faster according to my results: more than 10x!