RocksDb, Speedb, Lmdb 读取时间比较

准备

我将使用我之前的文章《Comparison embedded key-value storages for python》中的数据集和数据库。

python

import os
import json
import random
import string
from datetime import datetime

# Root directory for the generated dataset; the database directories below are created under it.
workspace_dir = '/tmp/data'
# Sub-directory that receives the generated JSON files.
output_dir = f'{workspace_dir}/jsons'
# Create the output directory; exist_ok=True makes this a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

def generate_random_string(length):
    """Return a random string of ``length`` characters.

    Characters are drawn independently (with replacement) from ASCII letters,
    digits, punctuation and the space character.
    """
    alphabet = string.ascii_letters + string.digits + string.punctuation + ' '
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

# Write one million JSON documents named 0.json ... 999999.json into output_dir.
for i in range(1000000):
    # Text length is drawn from 0..2000 inclusive; the timestamp is the
    # current Unix time truncated to whole seconds.
    text_length = random.randint(0, 2000)
    record = dict(
        text=generate_random_string(text_length),
        created_at=int(datetime.now().timestamp()),
    )
    target = os.path.join(output_dir, f'{i}.json')
    with open(target, 'w') as out:
        out.write(json.dumps(record))

文件迭代器：

python

import os
import json

def json_file_reader(directory):
    """Yield ``(index, text)`` for every ``<index>.json`` file in ``directory``.

    Entries are visited in ascending numeric order of the file stem, and the
    file content is yielded as raw, unparsed text (decoded as UTF-8).
    Directory entries that are not regular files are skipped.

    Raises:
        ValueError: if a ``.json`` entry has a stem that is not an integer.
    """
    # Pair each candidate name with its numeric stem so the stem is parsed once.
    indexed = [
        (int(os.path.splitext(name)[0]), name)
        for name in os.listdir(directory)
        if name.endswith('.json')
    ]
    # Sort on the number only; the sort is stable, so ties keep listing order.
    indexed.sort(key=lambda pair: pair[0])
    for index, name in indexed:
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        with open(path, 'r', encoding='utf-8') as handle:
            yield index, handle.read()

测量装饰器：

python

import time
from functools import wraps

def measure(attempts=1000):
    """Build a decorator that times repeated calls of a function.

    Every call to the decorated function runs the original ``attempts`` times
    with the same arguments, prints the minimum, maximum and average duration
    in milliseconds, and returns the result of the last run.

    Args:
        attempts: Number of timed runs per call; must be at least 1.

    Raises:
        ValueError: if ``attempts`` is smaller than 1.
    """
    if attempts < 1:
        # Without at least one run there is nothing to aggregate or return.
        raise ValueError("attempts must be at least 1")

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            execution_times = []
            for _ in range(attempts):
                # perf_counter() is a monotonic, high-resolution clock meant
                # for measuring short durations; time.time() follows the wall
                # clock, which can be adjusted mid-run and is coarser.
                start_time = time.perf_counter()
                result = func(*args, **kwargs)
                elapsed = time.perf_counter() - start_time
                execution_times.append(elapsed * 1000)  # seconds -> milliseconds

            min_time = min(execution_times)
            max_time = max(execution_times)
            avg_time = sum(execution_times) / attempts

            print(f"{func.__name__} 在 {attempts} 次尝试中的执行时间:")
            print(f"最小时间: {min_time:.2f} ms")
            print(f"最大时间: {max_time:.2f} ms")
            print(f"平均时间: {avg_time:.2f} ms")

            # Only the result of the final run is handed back to the caller.
            return result
        return wrapper
    return decorator

Lmdb

python

import lmdb

# Build the database
lmdb_dir = f'{workspace_dir}/lmdb'
# Reserve 100 GB (10 ** 11 bytes) as the map size
with lmdb.open(lmdb_dir, map_size=10 ** 11) as env, env.begin(write=True) as txn:
    # All records are written inside this single write transaction.
    for i, data in json_file_reader(output_dir):
        # Key: the record index as 4 big-endian bytes; value: the encoded JSON text.
        key = i.to_bytes(4, 'big')
        txn.put(key, data.encode())

# Read
@measure(attempts=1000)
def read_lmdb():
    """Open the LMDB environment, fetch record 123456 and return it as text.

    The environment is reopened on every call, so each timed attempt
    includes the cost of opening the database.
    """
    key = (123456).to_bytes(4, 'big')
    with lmdb.open(lmdb_dir) as env, env.begin() as txn:
        raw = txn.get(key)
        return raw.decode()


read_lmdb()

RocksDb

python

from rocksdict import Rdict

# Build the database
rocksdict_dir = f'{workspace_dir}/rocksdb'
# Store every JSON document (as text) under its integer index.
with Rdict(rocksdict_dir) as db:
    for i, data in json_file_reader(output_dir):
        db[i] = data

# Read
@measure(attempts=1000)
def read_rockdb():
    """Open the default RocksDB database and return the record stored under 123456.

    The database is reopened on every call, so each timed attempt includes
    the cost of opening it.
    """
    with Rdict(rocksdict_dir) as db:
        record = db[123456]
    return record


read_rockdb()

让我们尝试一些不同的参数：

RocksDb 的 RAW 模式：

python

from rocksdict import Rdict, Options

# Build the database
rocksdict_raw_dir = f'{workspace_dir}/rocksdb_raw'
# Opened with raw_mode=True; keys and values are written as bytes here:
# the record index as 4 big-endian bytes and the encoded JSON text.
with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
    for i, data in json_file_reader(output_dir):
        db[i.to_bytes(4, 'big')] = data.encode()

# Test
@measure(attempts=1000)
def read_rockdb_raw():
    """Open the raw-mode database and return record 123456 decoded to text.

    The key is the record index encoded as 4 big-endian bytes, matching how
    the records were written. The database is reopened on every call.
    """
    raw_options = Options(raw_mode=True)
    with Rdict(rocksdict_raw_dir, options=raw_options) as db:
        key = (123456).to_bytes(4, 'big')
        payload = db[key]
        return payload.decode()


read_rockdb_raw()

带压缩的 RocksDb：

python

from rocksdict import Rdict, Options, DBCompressionType


# Build the database
def db_options():
    """Return a fresh ``Options`` object with the compression type set to zstd."""
    options = Options()
    compression = DBCompressionType.zstd()
    options.set_compression_type(compression)
    return options


rocksdict_zstd_dir = f'{workspace_dir}/rocksdb_zstd'
# Build the database with the options returned by db_options(),
# storing every JSON document (as text) under its integer index.
with Rdict(rocksdict_zstd_dir, options=db_options()) as db:
    for i, data in json_file_reader(output_dir):
        db[i] = data


# Test
@measure(attempts=1000)
def read_rockdb_zstd():
    """Open the zstd-compressed database and return the record stored under 123456.

    The database is reopened on every call, so each timed attempt includes
    the cost of opening it.
    """
    # NOTE(review): no options are passed when reopening; presumably rocksdict
    # reloads the options stored with the database -- confirm.
    with Rdict(rocksdict_zstd_dir) as db:
        record = db[123456]
    return record


read_rockdb_zstd()

RocksDb 只读模式：

python

from rocksdict import Rdict, AccessType


@measure(attempts=1000)
def read_rockdb_read_only():
    """Return record 123456 from the default database opened in read-only mode.

    Named distinctly from ``read_rockdb`` so that the timing report printed by
    ``measure`` (which is labelled with the function name) identifies this
    variant instead of repeating the label of the plain benchmark.
    The database is reopened on every call.
    """
    with Rdict(rocksdict_dir, access_type=AccessType.read_only()) as db:
        return db[123456]


read_rockdb_read_only()

# Directory handed to AccessType.secondary() for the secondary instance.
rocksdict_dir_secondary = f'{workspace_dir}/rocksdb_secondary'


@measure(attempts=1000)
def read_rockdb_secondary():
    """Return record 123456 from the default database opened as a secondary instance.

    Named distinctly from ``read_rockdb`` so that the timing report printed by
    ``measure`` (which is labelled with the function name) identifies this
    variant instead of repeating the label of the plain benchmark.
    The database is reopened on every call.
    """
    secondary_access = AccessType.secondary(rocksdict_dir_secondary)
    with Rdict(rocksdict_dir, access_type=secondary_access) as db:
        return db[123456]


read_rockdb_secondary()

Speedb

要测试 Speedb，只需将导入的 rocksdict 改为 speedict。

结果

名称	占用空间	写入时间	读取最小	读取平均	读取最大
lmdb	1.1G	1m 26s	0.17ms	0.31ms	3.68ms
rocksdb	1.0G	1m 29s	2.85ms	6.32ms	30.85ms
rocksdb (raw)	1.0G	1m 36s	4.04ms	10.03ms	37.07ms
rocksdb (zstd)	878M	1m 32s	3.52ms	6.85ms	18.18ms
rocksdb (read only)	1.0G	-	2.41ms	3.65ms	10.68ms
rocksdb (secondary)	1.0G	-	4.71ms	5.47ms	31.92ms
speedb	1.1G	1m 31s	4.23ms	10.44ms	101.32ms
speedb (raw)	1.1G	1m 36s	4.45ms	15.70ms	55.91ms
speedb (zstd)	876M	1m 36s	4.22ms	13.49ms	87.59ms
speedb (read only)	1.1G	-	3.95ms	5.19ms	16.68ms
speedb (secondary)	1.1G	-	-	-	-

Speedb 不支持 secondary（二级）模式。
令我惊讶的是，Speedb 的最大延迟比 RocksDb 更差。
奇怪的是，RocksDb 在 RAW（原始）模式下的延迟反而更差。

总结

根据我的测试结果，Lmdb 明显更快：读取速度快了 10 倍以上！

RocksDb, Speedb, Lmdb 读取时间比较 ​

准备 ​

Lmdb ​

RocksDb ​

RocksDb 的 RAW 模式： ​

带压缩的 RocksDb： ​

RocksDb 只读模式： ​

Speedb ​

结果 ​

总结 ​

RocksDb, Speedb, Lmdb 读取时间比较

准备

Lmdb

RocksDb

RocksDb 的 RAW 模式：

带压缩的 RocksDb：

RocksDb 只读模式：

Speedb

结果

总结