310 lines
7.4 KiB
Go
Vendored
310 lines
7.4 KiB
Go
Vendored
// Copyright 2014-2021 Ulrich Kunitz. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
package lzma
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
|
|
"github.com/ulikunitz/xz/internal/hash"
|
|
)
|
|
|
|
/* For compression we need to find byte sequences that match the byte
|
|
* sequence at the dictionary head. A hash table is a simple method to
|
|
* provide this capability.
|
|
*/
|
|
|
|
// Tuning constants for the hash table matcher.
const (
	// maxMatches is the upper bound for the number of candidate
	// positions requested from the Matches method. A smaller value
	// speeds up the overall encoding.
	maxMatches = 16

	// shortDists is the number of short distances that are always
	// tested by the implementation.
	shortDists = 8

	// minTableExponent and maxTableExponent bound the exponent e of
	// the hash table, which has 1<<e slots. The minimum is somewhat
	// arbitrary; the maximum is limited by the memory requirements
	// of the hash table.
	minTableExponent = 9
	maxTableExponent = 20
)
|
|
|
|
// newRoller contains the function used to create an instance of the
|
|
// hash.Roller.
|
|
var newRoller = func(n int) hash.Roller { return hash.NewCyclicPoly(n) }
|
|
|
|
// hashTable stores the hash table including the rolling hash method.
|
|
//
|
|
// We implement chained hashing into a circular buffer. Each entry in
|
|
// the circular buffer stores the delta distance to the next position with a
|
|
// word that has the same hash value.
|
|
type hashTable struct {
|
|
dict *encoderDict
|
|
// actual hash table
|
|
t []int64
|
|
// circular list data with the offset to the next word
|
|
data []uint32
|
|
front int
|
|
// mask for computing the index for the hash table
|
|
mask uint64
|
|
// hash offset; initial value is -int64(wordLen)
|
|
hoff int64
|
|
// length of the hashed word
|
|
wordLen int
|
|
// hash roller for computing the hash values for the Write
|
|
// method
|
|
wr hash.Roller
|
|
// hash roller for computing arbitrary hashes
|
|
hr hash.Roller
|
|
// preallocated slices
|
|
p [maxMatches]int64
|
|
distances [maxMatches + shortDists]int
|
|
}
|
|
|
|
// hashTableExponent derives the hash table exponent from the dictionary
|
|
// capacity.
|
|
func hashTableExponent(n uint32) int {
|
|
e := 30 - nlz32(n)
|
|
switch {
|
|
case e < minTableExponent:
|
|
e = minTableExponent
|
|
case e > maxTableExponent:
|
|
e = maxTableExponent
|
|
}
|
|
return e
|
|
}
|
|
|
|
// newHashTable creates a new hash table for words of length wordLen
|
|
func newHashTable(capacity int, wordLen int) (t *hashTable, err error) {
|
|
if !(0 < capacity) {
|
|
return nil, errors.New(
|
|
"newHashTable: capacity must not be negative")
|
|
}
|
|
exp := hashTableExponent(uint32(capacity))
|
|
if !(1 <= wordLen && wordLen <= 4) {
|
|
return nil, errors.New("newHashTable: " +
|
|
"argument wordLen out of range")
|
|
}
|
|
n := 1 << uint(exp)
|
|
if n <= 0 {
|
|
panic("newHashTable: exponent is too large")
|
|
}
|
|
t = &hashTable{
|
|
t: make([]int64, n),
|
|
data: make([]uint32, capacity),
|
|
mask: (uint64(1) << uint(exp)) - 1,
|
|
hoff: -int64(wordLen),
|
|
wordLen: wordLen,
|
|
wr: newRoller(wordLen),
|
|
hr: newRoller(wordLen),
|
|
}
|
|
return t, nil
|
|
}
|
|
|
|
// SetDict sets the encoder dictionary that is used by NextOp.
func (t *hashTable) SetDict(d *encoderDict) {
	t.dict = d
}
|
|
|
|
// buffered returns the number of bytes that are currently hashed.
|
|
func (t *hashTable) buffered() int {
|
|
n := t.hoff + 1
|
|
switch {
|
|
case n <= 0:
|
|
return 0
|
|
case n >= int64(len(t.data)):
|
|
return len(t.data)
|
|
}
|
|
return int(n)
|
|
}
|
|
|
|
// addIndex adds n to an index ensuring that is stays inside the
|
|
// circular buffer for the hash chain.
|
|
func (t *hashTable) addIndex(i, n int) int {
|
|
i += n - len(t.data)
|
|
if i < 0 {
|
|
i += len(t.data)
|
|
}
|
|
return i
|
|
}
|
|
|
|
// putDelta puts the delta instance at the current front of the circular
|
|
// chain buffer.
|
|
func (t *hashTable) putDelta(delta uint32) {
|
|
t.data[t.front] = delta
|
|
t.front = t.addIndex(t.front, 1)
|
|
}
|
|
|
|
// putEntry puts a new entry into the hash table. If there is already a
|
|
// value stored it is moved into the circular chain buffer.
|
|
func (t *hashTable) putEntry(h uint64, pos int64) {
|
|
if pos < 0 {
|
|
return
|
|
}
|
|
i := h & t.mask
|
|
old := t.t[i] - 1
|
|
t.t[i] = pos + 1
|
|
var delta int64
|
|
if old >= 0 {
|
|
delta = pos - old
|
|
if delta > 1<<32-1 || delta > int64(t.buffered()) {
|
|
delta = 0
|
|
}
|
|
}
|
|
t.putDelta(uint32(delta))
|
|
}
|
|
|
|
// WriteByte converts a single byte into a hash and puts them into the hash
|
|
// table.
|
|
func (t *hashTable) WriteByte(b byte) error {
|
|
h := t.wr.RollByte(b)
|
|
t.hoff++
|
|
t.putEntry(h, t.hoff)
|
|
return nil
|
|
}
|
|
|
|
// Write converts the bytes provided into hash tables and stores the
|
|
// abbreviated offsets into the hash table. The method will never return an
|
|
// error.
|
|
func (t *hashTable) Write(p []byte) (n int, err error) {
|
|
for _, b := range p {
|
|
// WriteByte doesn't generate an error.
|
|
t.WriteByte(b)
|
|
}
|
|
return len(p), nil
|
|
}
|
|
|
|
// getMatches the matches for a specific hash. The functions returns the
|
|
// number of positions found.
|
|
//
|
|
// TODO: Make a getDistances because that we are actually interested in.
|
|
func (t *hashTable) getMatches(h uint64, positions []int64) (n int) {
|
|
if t.hoff < 0 || len(positions) == 0 {
|
|
return 0
|
|
}
|
|
buffered := t.buffered()
|
|
tailPos := t.hoff + 1 - int64(buffered)
|
|
rear := t.front - buffered
|
|
if rear >= 0 {
|
|
rear -= len(t.data)
|
|
}
|
|
// get the slot for the hash
|
|
pos := t.t[h&t.mask] - 1
|
|
delta := pos - tailPos
|
|
for {
|
|
if delta < 0 {
|
|
return n
|
|
}
|
|
positions[n] = tailPos + delta
|
|
n++
|
|
if n >= len(positions) {
|
|
return n
|
|
}
|
|
i := rear + int(delta)
|
|
if i < 0 {
|
|
i += len(t.data)
|
|
}
|
|
u := t.data[i]
|
|
if u == 0 {
|
|
return n
|
|
}
|
|
delta -= int64(u)
|
|
}
|
|
}
|
|
|
|
// hash computes the rolling hash for the word stored in p. For correct
|
|
// results its length must be equal to t.wordLen.
|
|
func (t *hashTable) hash(p []byte) uint64 {
|
|
var h uint64
|
|
for _, b := range p {
|
|
h = t.hr.RollByte(b)
|
|
}
|
|
return h
|
|
}
|
|
|
|
// Matches fills the positions slice with potential matches. The
|
|
// functions returns the number of positions filled into positions. The
|
|
// byte slice p must have word length of the hash table.
|
|
func (t *hashTable) Matches(p []byte, positions []int64) int {
|
|
if len(p) != t.wordLen {
|
|
panic(fmt.Errorf(
|
|
"byte slice must have length %d", t.wordLen))
|
|
}
|
|
h := t.hash(p)
|
|
return t.getMatches(h, positions)
|
|
}
|
|
|
|
// NextOp identifies the next operation using the hash table.
|
|
//
|
|
// TODO: Use all repetitions to find matches.
|
|
func (t *hashTable) NextOp(rep [4]uint32) operation {
|
|
// get positions
|
|
data := t.dict.data[:maxMatchLen]
|
|
n, _ := t.dict.buf.Peek(data)
|
|
data = data[:n]
|
|
var p []int64
|
|
if n < t.wordLen {
|
|
p = t.p[:0]
|
|
} else {
|
|
p = t.p[:maxMatches]
|
|
n = t.Matches(data[:t.wordLen], p)
|
|
p = p[:n]
|
|
}
|
|
|
|
// convert positions in potential distances
|
|
head := t.dict.head
|
|
dists := append(t.distances[:0], 1, 2, 3, 4, 5, 6, 7, 8)
|
|
for _, pos := range p {
|
|
dis := int(head - pos)
|
|
if dis > shortDists {
|
|
dists = append(dists, dis)
|
|
}
|
|
}
|
|
|
|
// check distances
|
|
var m match
|
|
dictLen := t.dict.DictLen()
|
|
for _, dist := range dists {
|
|
if dist > dictLen {
|
|
continue
|
|
}
|
|
|
|
// Here comes a trick. We are only interested in matches
|
|
// that are longer than the matches we have been found
|
|
// before. So before we test the whole byte sequence at
|
|
// the given distance, we test the first byte that would
|
|
// make the match longer. If it doesn't match the byte
|
|
// to match, we don't to care any longer.
|
|
i := t.dict.buf.rear - dist + m.n
|
|
if i < 0 {
|
|
i += len(t.dict.buf.data)
|
|
}
|
|
if t.dict.buf.data[i] != data[m.n] {
|
|
// We can't get a longer match. Jump to the next
|
|
// distance.
|
|
continue
|
|
}
|
|
|
|
n := t.dict.buf.matchLen(dist, data)
|
|
switch n {
|
|
case 0:
|
|
continue
|
|
case 1:
|
|
if uint32(dist-minDistance) != rep[0] {
|
|
continue
|
|
}
|
|
}
|
|
if n > m.n {
|
|
m = match{int64(dist), n}
|
|
if n == len(data) {
|
|
// No better match will be found.
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if m.n == 0 {
|
|
return lit{data[0]}
|
|
}
|
|
return m
|
|
}
|