178 lines
6.7 KiB
Go
178 lines
6.7 KiB
Go
|
// Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
|
||
|
// All rights reserved.
|
||
|
//
|
||
|
// Use of this source code is governed by a BSD-style license that can be
|
||
|
// found in the LICENSE file.
|
||
|
|
||
|
// Package table allows reading and writing sorted key/value pairs.
|
||
|
package table
|
||
|
|
||
|
import (
|
||
|
"encoding/binary"
|
||
|
)
|
||
|
|
||
|
/*
|
||
|
Table:
|
||
|
|
||
|
A table consists of one or more data blocks, an optional filter block,
a metaindex block, an index block and a table footer. The metaindex block
is a special block used to keep parameters of the table, such as the filter
block name and its block handle. The index block is a special block used to
keep a record of each data block's offset and length; the index block uses
one as its restart interval. The keys used by the index block are the last
key of the preceding block, a shorter separator of adjacent blocks, or a
shorter successor of the last key of the last block. The filter block is an
optional block that contains a sequence of filter data generated by a
filter generator.
|
||
|
|
||
|
Table data structure:
|
||
|
+ optional
|
||
|
/
|
||
|
+--------------+--------------+--------------+------+-------+-----------------+-------------+--------+
|
||
|
| data block 1 | ... | data block n | filter block | metaindex block | index block | footer |
|
||
|
+--------------+--------------+--------------+--------------+-----------------+-------------+--------+
|
||
|
|
||
|
Each block is followed by a 5-byte trailer that contains the compression type and a checksum.
|
||
|
|
||
|
Table block trailer:
|
||
|
|
||
|
+---------------------------+-------------------+
|
||
|
| compression type (1-byte) | checksum (4-byte) |
|
||
|
+---------------------------+-------------------+
|
||
|
|
||
|
The checksum is a CRC-32 computed using Castagnoli's polynomial. The compression
type is also included in the checksum.
|
||
|
|
||
|
Table footer:
|
||
|
|
||
|
+------------------- 40-bytes -------------------+
|
||
|
/ \
|
||
|
+------------------------+--------------------+------+-----------------+
|
||
|
| metaindex block handle / index block handle / ---- | magic (8-bytes) |
|
||
|
+------------------------+--------------------+------+-----------------+
|
||
|
|
||
|
The magic is the first 64 bits of the SHA-1 sum of "http://code.google.com/p/leveldb/".
|
||
|
|
||
|
NOTE: All fixed-length integers are little-endian.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
Block:
|
||
|
|
||
|
A block consists of one or more key/value entries and a block trailer.
A block entry shares its key prefix with the preceding key until a restart
point is reached. A block should contain at least one restart point.
The first restart point is always zero.
|
||
|
|
||
|
Block data structure:
|
||
|
|
||
|
+ restart point + restart point (depends on restart interval)
|
||
|
/ /
|
||
|
+---------------+---------------+---------------+---------------+---------+
|
||
|
| block entry 1 | block entry 2 | ... | block entry n | trailer |
|
||
|
+---------------+---------------+---------------+---------------+---------+
|
||
|
|
||
|
Key/value entry:
|
||
|
|
||
|
+---- key len ----+
|
||
|
/ \
|
||
|
+-------+---------+-----------+---------+--------------------+--------------+----------------+
|
||
|
| shared (varint) | not shared (varint) | value len (varint) | key (varlen) | value (varlen) |
|
||
|
+-----------------+---------------------+--------------------+--------------+----------------+
|
||
|
|
||
|
Block entry shares key prefix with its preceding key:
|
||
|
Conditions:
|
||
|
restart_interval=2
|
||
|
entry one : key=deck,value=v1
|
||
|
entry two : key=dock,value=v2
|
||
|
entry three: key=duck,value=v3
|
||
|
The entries will be encoded as follows:

      + restart point (offset=0)                                                 + restart point (offset=17)
     /                                                                          /
    +-----+-----+-----+----------+--------+-----+-----+-----+---------+--------+-----+-----+-----+----------+--------+
    |  0  |  4  |  2  |  "deck"  |  "v1"  |  1  |  3  |  2  |  "ock"  |  "v2"  |  0  |  4  |  2  |  "duck"  |  "v3"  |
    +-----+-----+-----+----------+--------+-----+-----+-----+---------+--------+-----+-----+-----+----------+--------+
     \                                   / \                                  / \                                   /
      +----------- entry one -----------+   +----------- entry two ----------+   +---------- entry three ----------+

Entry one occupies 9 bytes (3 one-byte varints, a 4-byte key and a 2-byte value)
and entry two occupies 8 bytes, so the second restart point is at offset 17.

The block trailer will contain two restart points:

    +------------+-----------+--------+
    |     0      |    17     |   2    |
    +------------+-----------+---+----+
     \                      /     \
      +-- restart points --+       + restart points length
|
||
|
|
||
|
Block trailer:
|
||
|
|
||
|
+-- 4-bytes --+
|
||
|
/ \
|
||
|
+-----------------+-----------------+-----------------+------------------------------+
|
||
|
| restart point 1 | .... | restart point n | restart points len (4-bytes) |
|
||
|
+-----------------+-----------------+-----------------+------------------------------+
|
||
|
|
||
|
|
||
|
NOTE: All fixed-length integers are little-endian.
|
||
|
*/
|
||
|
|
||
|
/*
|
||
|
Filter block:
|
||
|
|
||
|
A filter block consists of one or more filter data and a filter block trailer.
The trailer contains the filter data offsets, a trailer offset and a 1-byte base Lg.
|
||
|
|
||
|
Filter block data structure:
|
||
|
|
||
|
+ offset 1 + offset 2 + offset n + trailer offset
|
||
|
/ / / /
|
||
|
+---------------+---------------+---------------+---------+
|
||
|
| filter data 1 | ... | filter data n | trailer |
|
||
|
+---------------+---------------+---------------+---------+
|
||
|
|
||
|
Filter block trailer:
|
||
|
|
||
|
+- 4-bytes -+
|
||
|
/ \
|
||
|
+---------------+---------------+---------------+-------------------------------+------------------+
|
||
|
| data 1 offset | .... | data n offset | data-offsets offset (4-bytes) | base Lg (1-byte) |
|
||
|
+-------------- +---------------+---------------+-------------------------------+------------------+
|
||
|
|
||
|
|
||
|
NOTE: All fixed-length integers are little-endian.
|
||
|
*/
|
||
|
|
||
|
// Fixed sizes of the on-disk table structures.
const (
	// blockTrailerLen is the size in bytes of the trailer that follows each
	// block: a 1-byte compression type plus a 4-byte checksum.
	blockTrailerLen = 5

	// footerLen is the size in bytes of the table footer: a 40-byte region
	// for the metaindex and index block handles, followed by the 8-byte magic.
	footerLen = 48
)

// magic is the 8-byte marker stored at the end of the table footer.
const magic = "\x57\xfb\x80\x8b\x24\x75\x47\xdb"

// The block type gives the per-block compression format.
// These constants are part of the file format and should not be changed.
const (
	blockTypeNoCompression     = 0
	blockTypeSnappyCompression = 1
)

// A new filter is generated for every 2KB (filterBase bytes) of data;
// filterBaseLg is the base-2 logarithm of filterBase.
const (
	filterBaseLg = 11
	filterBase   = 1 << filterBaseLg
)
|
||
|
|
||
|
// blockHandle records the offset and length of a block.
type blockHandle struct {
	offset uint64 // where the block starts
	length uint64 // how long the block is
}
|
||
|
|
||
|
func decodeBlockHandle(src []byte) (blockHandle, int) {
|
||
|
offset, n := binary.Uvarint(src)
|
||
|
length, m := binary.Uvarint(src[n:])
|
||
|
if n == 0 || m == 0 {
|
||
|
return blockHandle{}, 0
|
||
|
}
|
||
|
return blockHandle{offset, length}, n + m
|
||
|
}
|
||
|
|
||
|
func encodeBlockHandle(dst []byte, b blockHandle) int {
|
||
|
n := binary.PutUvarint(dst, b.offset)
|
||
|
m := binary.PutUvarint(dst[n:], b.length)
|
||
|
return n + m
|
||
|
}
|