Skip to content

RocksDb, Speedb, Lmdb 读取时间比较

准备

我将使用我之前的文章《Comparison embedded key-value storages for python》中的数据集和数据库。

python
import os
import json
import random
import string
from datetime import datetime

# Base folder for the benchmark data; the generated JSON files are written
# to its "jsons" subfolder.
workspace_dir = '/tmp/data'
output_dir = f'{workspace_dir}/jsons'
# exist_ok=True keeps a re-run from failing when the folder already exists.
os.makedirs(output_dir, exist_ok=True)

def generate_random_string(length):
    """Return a random string of exactly ``length`` characters.

    Characters are drawn (with replacement) from ASCII letters, digits,
    punctuation and the space character.
    """
    alphabet = string.ascii_letters + string.digits + string.punctuation + ' '
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

# Write one million JSON documents, one file per record, named "<index>.json".
for i in range(1_000_000):
    # Random payload of 0..2000 characters plus the current Unix timestamp.
    text = generate_random_string(random.randint(0, 2000))
    created_at = int(datetime.now().timestamp())
    target_path = os.path.join(output_dir, f'{i}.json')
    with open(target_path, 'w') as fh:
        json.dump({'text': text, 'created_at': created_at}, fh)

文件迭代器:

python
import os
import json

def json_file_reader(directory):
    """Yield ``(index, text)`` pairs for the ``*.json`` files in *directory*.

    Files are visited in ascending order of the integer encoded in their
    name (``12.json`` -> ``12``). Entries that end in ``.json`` but are not
    regular files are skipped. The file content is yielded as the raw text,
    it is not parsed.

    Raises:
        ValueError: If a ``.json`` entry has a non-integer base name.
    """
    def numeric_stem(name):
        # "123.json" -> 123
        return int(os.path.splitext(name)[0])

    candidates = [name for name in os.listdir(directory) if name.endswith('.json')]
    candidates.sort(key=numeric_stem)

    for name in candidates:
        full_path = os.path.join(directory, name)
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r', encoding='utf-8') as handle:
            yield numeric_stem(name), handle.read()

测量装饰器:

python
import time
from functools import wraps

def measure(attempts=1000):
    """Build a decorator that times repeated calls of a function.

    Each call of the decorated function runs the original ``attempts``
    times with the same arguments, prints the minimum, maximum and mean
    duration in milliseconds, and returns the value produced by the last
    run.

    Args:
        attempts: Number of timed runs per call; must be at least 1.

    Returns:
        A decorator wrapping a callable with the timing loop.

    Raises:
        ValueError: If ``attempts`` is smaller than 1.
    """
    # Fail early: with zero runs there is nothing to aggregate or return.
    if attempts < 1:
        raise ValueError(f"attempts must be >= 1, got {attempts}")

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            execution_times = []
            for _ in range(attempts):
                # perf_counter() is monotonic and high-resolution, unlike
                # time.time(), which follows the adjustable system clock.
                start_time = time.perf_counter()
                result = func(*args, **kwargs)
                elapsed = time.perf_counter() - start_time
                execution_times.append(elapsed * 1000)  # seconds -> milliseconds

            min_time = min(execution_times)
            max_time = max(execution_times)
            avg_time = sum(execution_times) / len(execution_times)

            # Separator added so the function name and the count no longer
            # run together in the output.
            print(f"{func.__name__} 在 {attempts} 次尝试中的执行时间:")
            print(f"最小时间: {min_time:.2f} ms")
            print(f"最大时间: {max_time:.2f} ms")
            print(f"平均时间: {avg_time:.2f} ms")

            return result
        return wrapper
    return decorator

Lmdb

python
import lmdb

# Create the database
lmdb_dir = f'{workspace_dir}/lmdb'
# Reserve 100 GB (10 ** 11 bytes) for the memory map.
with lmdb.open(lmdb_dir, map_size=10 ** 11) as env:
    # All records are inserted inside a single write transaction.
    with env.begin(write=True) as txn:
        for index, payload in json_file_reader(output_dir):
            # Keys are the record index as 4 big-endian bytes.
            txn.put(index.to_bytes(4, 'big'), payload.encode())

# Read benchmark
@measure(attempts=1000)
def read_lmdb():
    """Open the LMDB environment and return record 123456 decoded to ``str``.

    The environment is opened inside the function, so every timed attempt
    includes the cost of opening it.
    """
    key = (123456).to_bytes(4, 'big')
    with lmdb.open(lmdb_dir) as env, env.begin() as txn:
        return txn.get(key).decode()

read_lmdb()

RocksDb

python
from rocksdict import Rdict

# Create the database
rocksdict_dir = f'{workspace_dir}/rocksdb'
with Rdict(rocksdict_dir) as db:
    # Integer index as key, raw JSON text as value.
    for index, payload in json_file_reader(output_dir):
        db[index] = payload

# Read benchmark
@measure(attempts=1000)
def read_rockdb():
    """Open the RocksDB store and return the value stored under key 123456.

    The store is opened inside the function, so every timed attempt
    includes the cost of opening it.
    """
    key = 123456
    with Rdict(rocksdict_dir) as db:
        return db[key]

read_rockdb()

让我们尝试一些不同的参数:

RocksDb 的 RAW 模式:

python
from rocksdict import Rdict, Options

# Create the database
rocksdict_raw_dir = f'{workspace_dir}/rocksdb_raw'
with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
    for index, payload in json_file_reader(output_dir):
        # Here both key and value are passed as bytes:
        # 4-byte big-endian index -> encoded JSON text.
        db[index.to_bytes(4, 'big')] = payload.encode()

# Benchmark
@measure(attempts=1000)
def read_rockdb_raw():
    """Open the raw-mode RocksDB store and return record 123456 as ``str``.

    The store is opened inside the function, so every timed attempt
    includes the cost of opening it.
    """
    key = (123456).to_bytes(4, 'big')
    with Rdict(rocksdict_raw_dir, options=Options(raw_mode=True)) as db:
        return db[key].decode()

read_rockdb_raw()

带压缩的 RocksDb:

python
from rocksdict import Rdict, Options, DBCompressionType


# Create the database
def db_options():
    """Return a fresh ``Options`` object with zstd compression selected."""
    options = Options()
    options.set_compression_type(DBCompressionType.zstd())
    return options


rocksdict_zstd_dir = f'{workspace_dir}/rocksdb_zstd'
# Same data as the plain RocksDB store, written with the options from db_options().
with Rdict(rocksdict_zstd_dir, options=db_options()) as db:
    for index, payload in json_file_reader(output_dir):
        db[index] = payload


# Benchmark
@measure(attempts=1000)
def read_rockdb_zstd():
    """Open the zstd RocksDB store and return the value stored under key 123456.

    The store is opened inside the function (without explicit options), so
    every timed attempt includes the cost of opening it.
    """
    key = 123456
    with Rdict(rocksdict_zstd_dir) as db:
        return db[key]
read_rockdb_zstd()

RocksDb 只读模式:

python
from rocksdict import Rdict, AccessType


@measure(attempts=1000)
def read_rockdb():
    """Open the RocksDB store with read-only access and return key 123456.

    The store is opened inside the function, so every timed attempt
    includes the cost of opening it.
    """
    key = 123456
    with Rdict(rocksdict_dir, access_type=AccessType.read_only()) as db:
        return db[key]
read_rockdb()

# Path handed to AccessType.secondary() below.
rocksdict_dir_secondary = f'{workspace_dir}/rocksdb_secondary'

@measure(attempts=1000)
def read_rockdb():
    """Open the RocksDB store with secondary access and return key 123456.

    The store is opened inside the function, so every timed attempt
    includes the cost of opening it.
    """
    key = 123456
    secondary_access = AccessType.secondary(rocksdict_dir_secondary)
    with Rdict(rocksdict_dir, access_type=secondary_access) as db:
        return db[key]
read_rockdb()

Speedb

要测试 Speedb,只需将导入的 rocksdict 改为 speedict。

结果

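| 名称 | 占用空间 | 写入时间 | 读取最小 | 读取平均 | 读取最大 |
| --- | --- | --- | --- | --- | --- |
| lmdb | 1.1G | 1m 26s | 0.17ms | 0.31ms | 3.68ms |
| rocksdb | 1.0G | 1m 29s | 2.85ms | 6.32ms | 30.85ms |
| rocksdb (raw) | 1.0G | 1m 36s | 4.04ms | 10.03ms | 37.07ms |
| rocksdb (zstd) | 878M | 1m 32s | 3.52ms | 6.85ms | 18.18ms |
| rocksdb (read only) | 1.0G | - | 2.41ms | 3.65ms | 10.68ms |
| rocksdb (secondary) | 1.0G | - | 4.71ms | 5.47ms | 31.92ms |
| speedb | 1.1G | 1m 31s | 4.23ms | 10.44ms | 101.32ms |
| speedb (raw) | 1.1G | 1m 36s | 4.45ms | 15.70ms | 55.91ms |
| speedb (zstd) | 876M | 1m 36s | 4.22ms | 13.49ms | 87.59ms |
| speedb (read only) | 1.1G | - | 3.95ms | 5.19ms | 16.68ms |
| speedb (secondary) | 1.1G | - | - | - | - |
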
  • Speedb 不支持二级模式
  • 我很惊讶 Speedb 的最大延迟比 RocksDb 差
  • 奇怪的是,RocksDb 在 RAW 模式下的延迟表现反而更差

总结

根据我的结果,Lmdb 更快:快了超过10倍!