Comparing read times of RocksDB, Speedb, and LMDB
Preparation
I will use the dataset and databases from my previous article, Comparison embedded key-value storages for python.
python
import os
import json
import random
import string
from datetime import datetime

workspace_dir = '/tmp/data'
output_dir = f'{workspace_dir}/jsons'
os.makedirs(output_dir, exist_ok=True)

def generate_random_string(length):
    return ''.join(random.choices(string.ascii_letters + string.digits + string.punctuation + ' ', k=length))

# generate 1,000,000 JSON files, each with a random text payload and a timestamp
for i in range(1000000):
    data = {
        'text': generate_random_string(random.randint(0, 2000)),
        'created_at': int(datetime.now().timestamp())
    }
    filename = os.path.join(output_dir, f'{i}.json')
    with open(filename, 'w') as json_file:
        json.dump(data, json_file)
Iterator for files:
python
import os
import json

# yields (numeric id, raw JSON string) pairs in filename order
def json_file_reader(directory):
    json_files = [filename for filename in os.listdir(directory) if filename.endswith('.json')]
    sorted_files = sorted(json_files, key=lambda x: int(os.path.splitext(x)[0]))
    for filename in sorted_files:
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path):
            with open(file_path, 'r', encoding='utf-8') as json_file:
                yield int(os.path.splitext(filename)[0]), json_file.read()
Measuring decorator:
python
import time
from functools import wraps

def measure(attempts=1000):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            execution_times = []
            for _ in range(attempts):
                start_time = time.time()  # Record the start time
                result = func(*args, **kwargs)  # Call the original function
                end_time = time.time()  # Record the end time
                execution_time = (end_time - start_time) * 1000  # Convert to milliseconds
                execution_times.append(execution_time)
            min_time = min(execution_times)
            max_time = max(execution_times)
            avg_time = sum(execution_times) / attempts
            print(f"Execution time of {func.__name__} over {attempts} attempts:")
            print(f"Min: {min_time:.2f} ms")
            print(f"Max: {max_time:.2f} ms")
            print(f"Avg: {avg_time:.2f} ms")
            return result
        return wrapper
    return decorator
LMDB
python
import lmdb

# creating db
lmdb_dir = f'{workspace_dir}/lmdb'
# let's reserve 100Gb for the map size
with lmdb.open(lmdb_dir, map_size=10 ** 11) as env:
    with env.begin(write=True) as txn:
        for i, data in json_file_reader(output_dir):
            txn.put(i.to_bytes(4, 'big'), data.encode())

# reading
@measure(attempts=1000)
def read_lmdb():
    with lmdb.open(lmdb_dir) as env:
        with env.begin() as txn:
            return txn.get(int(123456).to_bytes(4, 'big')).decode()

read_lmdb()
RocksDB
python
from rocksdict import Rdict

# creating db
rocksdict_dir = f'{workspace_dir}/rocksdb'
with Rdict(rocksdict_dir) as db:
    for i, data in json_file_reader(output_dir):
        db[i] = data

# reading
@measure(attempts=1000)
def read_rockdb():
    with Rdict(rocksdict_dir) as db:
        return db[123456]

read_rockdb()
Let's play a bit with different parameters.
RocksDB in raw mode:
python
from rocksdict import Rdict, Options

# creating db in raw mode (keys and values are stored directly as bytes)
rocksdict_raw_dir = f'{workspace_dir}/rocksdb_raw'
with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
    for i, data in json_file_reader(output_dir):
        db[i.to_bytes(4, 'big')] = data.encode()

# test
@measure(attempts=1000)
def read_rockdb_raw():
    with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
        return db[int(123456).to_bytes(4, 'big')].decode()

read_rockdb_raw()
RocksDB with compression:
python
from rocksdict import Rdict, Options, DBCompressionType

# creating db
def db_options():
    opt = Options()
    opt.set_compression_type(DBCompressionType.zstd())
    return opt

rocksdict_zstd_dir = f'{workspace_dir}/rocksdb_zstd'
with Rdict(rocksdict_zstd_dir, options=db_options()) as db:
    for i, data in json_file_reader(output_dir):
        db[i] = data

# test
@measure(attempts=1000)
def read_rockdb_zstd():
    with Rdict(rocksdict_zstd_dir) as db:
        return db[123456]

read_rockdb_zstd()
RocksDB in read-only and secondary modes:
python
from rocksdict import Rdict, AccessType

# read-only mode
@measure(attempts=1000)
def read_rockdb():
    with Rdict(rocksdict_dir, access_type=AccessType.read_only()) as db:
        return db[123456]

read_rockdb()

# secondary mode
rocksdict_dir_secondary = f'{workspace_dir}/rocksdb_secondary'

@measure(attempts=1000)
def read_rockdb():
    with Rdict(rocksdict_dir, access_type=AccessType.secondary(rocksdict_dir_secondary)) as db:
        return db[123456]

read_rockdb()
Speedb
To test Speedb we just need to change the import from rocksdict to speedict; a minimal sketch follows below.
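A minimal sketch of the Speedb variant, assuming the speedict package mirrors the rocksdict API; the speedict_dir path and read_speedb name are mine, everything else follows the RocksDB example above:
python
from speedict import Rdict  # assumption: same Rdict interface as rocksdict

# creating db
speedict_dir = f'{workspace_dir}/speedb'
with Rdict(speedict_dir) as db:
    for i, data in json_file_reader(output_dir):
        db[i] = data

# reading
@measure(attempts=1000)
def read_speedb():
    with Rdict(speedict_dir) as db:
        return db[123456]

read_speedb()
The raw, zstd, and read-only variants can be reproduced the same way, keeping the rest of the code unchanged.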
Results
name | occupied space | writing time | read min | read avg | read max |
---|---|---|---|---|---|
lmdb | 1.1G | 1m 26s | 0.17ms | 0.31ms | 3.68ms |
rocksdb | 1.0G | 1m 29s | 2.85ms | 6.32ms | 30.85ms |
rocksdb (raw) | 1.0G | 1m 36s | 4.04ms | 10.03ms | 37.07ms |
rocksdb (zstd) | 878M | 1m 32s | 3.52ms | 6.85ms | 18.18ms |
rocksdb (read only) | 1.0G | - | 2.41ms | 3.65ms | 10.68ms |
rocksdb (secondary) | 1.0G | - | 4.71ms | 5.47ms | 31.92ms |
speedb | 1.1G | 1m 31s | 4.23ms | 10.44ms | 101.32ms |
speedb (raw) | 1.1G | 1m 36s | 4.45ms | 15.70ms | 55.91ms |
speedb (zstd) | 876M | 1m 36s | 4.22ms | 13.49ms | 87.59ms |
speedb (read only) | 1.1G | - | 3.95ms | 5.19ms | 16.68ms |
speedb (secondary) | 1.1G | - | - | - | - |
- Speedb doesn't support secondary mode
- I am surprised that Speedb shows worse latency than RocksDB, especially in the max column
- It is strange that RocksDB in raw mode shows worse latency than the default mode
Summary
LMDB is much faster according to my results: more than 10x!