mirror of https://github.com/go-gitea/gitea.git
Update bleve dependency to latest master revision (#6100)
* update bleve to master b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2 * remove unused pkg from dep file * change bleve from master to recent revision
This commit is contained in:
parent
11e316654e
commit
a380cfd8e0
|
@ -40,14 +40,6 @@
|
||||||
revision = "1a28a7fa985680f9f4e1644c0a857ec359a444b0"
|
revision = "1a28a7fa985680f9f4e1644c0a857ec359a444b0"
|
||||||
version = "v0.4.7"
|
version = "v0.4.7"
|
||||||
|
|
||||||
[[projects]]
|
|
||||||
branch = "master"
|
|
||||||
digest = "1:93367b6d47a8ccc7d14f9f493ccf103ccf5afb698559ff8e8f1999427ce27ace"
|
|
||||||
name = "github.com/Smerity/govarint"
|
|
||||||
packages = ["."]
|
|
||||||
pruneopts = "NUT"
|
|
||||||
revision = "7265e41f48f15fd61751e16da866af3c704bb3ab"
|
|
||||||
|
|
||||||
[[projects]]
|
[[projects]]
|
||||||
branch = "master"
|
branch = "master"
|
||||||
digest = "1:d290f4b25abbf574f80f60c8a5603ddada784f13f436b91a9a927bc7ce5a0146"
|
digest = "1:d290f4b25abbf574f80f60c8a5603ddada784f13f436b91a9a927bc7ce5a0146"
|
||||||
|
@ -98,7 +90,8 @@
|
||||||
revision = "3a771d992973f24aa725d07868b467d1ddfceafb"
|
revision = "3a771d992973f24aa725d07868b467d1ddfceafb"
|
||||||
|
|
||||||
[[projects]]
|
[[projects]]
|
||||||
digest = "1:c10f35be6200b09e26da267ca80f837315093ecaba27e7a223071380efb9dd32"
|
branch = "master"
|
||||||
|
digest = "1:b17287a86f6cac923a5d886e10618df994eeb54b6724eac2e3b8dde89cfbe3a2"
|
||||||
name = "github.com/blevesearch/bleve"
|
name = "github.com/blevesearch/bleve"
|
||||||
packages = [
|
packages = [
|
||||||
".",
|
".",
|
||||||
|
@ -121,7 +114,6 @@
|
||||||
"index/scorch",
|
"index/scorch",
|
||||||
"index/scorch/mergeplan",
|
"index/scorch/mergeplan",
|
||||||
"index/scorch/segment",
|
"index/scorch/segment",
|
||||||
"index/scorch/segment/mem",
|
|
||||||
"index/scorch/segment/zap",
|
"index/scorch/segment/zap",
|
||||||
"index/store",
|
"index/store",
|
||||||
"index/store/boltdb",
|
"index/store/boltdb",
|
||||||
|
@ -141,9 +133,10 @@
|
||||||
"search/query",
|
"search/query",
|
||||||
"search/scorer",
|
"search/scorer",
|
||||||
"search/searcher",
|
"search/searcher",
|
||||||
|
"size",
|
||||||
]
|
]
|
||||||
pruneopts = "NUT"
|
pruneopts = "NUT"
|
||||||
revision = "c74e08f039e56cef576e4336382b2a2d12d9e026"
|
revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5"
|
||||||
|
|
||||||
[[projects]]
|
[[projects]]
|
||||||
branch = "master"
|
branch = "master"
|
||||||
|
@ -160,14 +153,6 @@
|
||||||
pruneopts = "NUT"
|
pruneopts = "NUT"
|
||||||
revision = "db70c57796cc8c310613541dfade3dce627d09c7"
|
revision = "db70c57796cc8c310613541dfade3dce627d09c7"
|
||||||
|
|
||||||
[[projects]]
|
|
||||||
digest = "1:c7e0968c05659f3973148cd5c5387d6ee960a6ae1b2eaaec0b1d435d806458bb"
|
|
||||||
name = "github.com/boltdb/bolt"
|
|
||||||
packages = ["."]
|
|
||||||
pruneopts = "NUT"
|
|
||||||
revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7"
|
|
||||||
source = "github.com/go-gitea/bolt"
|
|
||||||
|
|
||||||
[[projects]]
|
[[projects]]
|
||||||
digest = "1:7c96cf7bf7f52af67f7a8222185813b9b665f5172ec2ac5f7d49ed96e5fcf3e5"
|
digest = "1:7c96cf7bf7f52af67f7a8222185813b9b665f5172ec2ac5f7d49ed96e5fcf3e5"
|
||||||
name = "github.com/boombuler/barcode"
|
name = "github.com/boombuler/barcode"
|
||||||
|
@ -217,15 +202,16 @@
|
||||||
|
|
||||||
[[projects]]
|
[[projects]]
|
||||||
branch = "master"
|
branch = "master"
|
||||||
digest = "1:82e1ad11d777f7bff9a1fc678a8a534a318f85e5026a8a4d6f4a94a6b0678bb6"
|
digest = "1:6a658ac7d23204dc743c7155557c45273747d78e05ae0579742bd6b744bce215"
|
||||||
name = "github.com/couchbase/vellum"
|
name = "github.com/couchbase/vellum"
|
||||||
packages = [
|
packages = [
|
||||||
".",
|
".",
|
||||||
|
"levenshtein2",
|
||||||
"regexp",
|
"regexp",
|
||||||
"utf8",
|
"utf8",
|
||||||
]
|
]
|
||||||
pruneopts = "NUT"
|
pruneopts = "NUT"
|
||||||
revision = "eb6ae3743b3f300f2136f83ca78c08cc071edbd4"
|
revision = "e91b68ff3efe3cc11723aa25dd315cbc9276cd65"
|
||||||
|
|
||||||
[[projects]]
|
[[projects]]
|
||||||
branch = "master"
|
branch = "master"
|
||||||
|
@ -287,6 +273,14 @@
|
||||||
revision = "1615341f118ae12f353cc8a983f35b584342c9b3"
|
revision = "1615341f118ae12f353cc8a983f35b584342c9b3"
|
||||||
version = "v1.12.0"
|
version = "v1.12.0"
|
||||||
|
|
||||||
|
[[projects]]
|
||||||
|
digest = "1:ae8eea1a24ae43a46c2e96631b6303fcc4210ca0ac9d643e4da965029d1b511d"
|
||||||
|
name = "github.com/etcd-io/bbolt"
|
||||||
|
packages = ["."]
|
||||||
|
pruneopts = "NUT"
|
||||||
|
revision = "63597a96ec0ad9e6d43c3fc81e809909e0237461"
|
||||||
|
version = "v1.3.2"
|
||||||
|
|
||||||
[[projects]]
|
[[projects]]
|
||||||
digest = "1:8603f74d35c93b37c615a02ba297be2cf2efc9ff6f1ff2b458a903990b568e48"
|
digest = "1:8603f74d35c93b37c615a02ba297be2cf2efc9ff6f1ff2b458a903990b568e48"
|
||||||
name = "github.com/ethantkoenig/rupture"
|
name = "github.com/ethantkoenig/rupture"
|
||||||
|
|
|
@ -15,10 +15,8 @@ ignored = ["google.golang.org/appengine*"]
|
||||||
name = "code.gitea.io/sdk"
|
name = "code.gitea.io/sdk"
|
||||||
|
|
||||||
[[constraint]]
|
[[constraint]]
|
||||||
# branch = "master"
|
revision = "05d86ea8f6e30456949f612cf68cf4a27ce8c9c5"
|
||||||
revision = "c74e08f039e56cef576e4336382b2a2d12d9e026"
|
|
||||||
name = "github.com/blevesearch/bleve"
|
name = "github.com/blevesearch/bleve"
|
||||||
#Not targetting v0.7.0 since standard where use only just after this tag
|
|
||||||
|
|
||||||
[[constraint]]
|
[[constraint]]
|
||||||
revision = "12dd70caea0268ac0d6c2707d0611ef601e7c64e"
|
revision = "12dd70caea0268ac0d6c2707d0611ef601e7c64e"
|
||||||
|
@ -108,11 +106,6 @@ ignored = ["google.golang.org/appengine*"]
|
||||||
name = "gopkg.in/testfixtures.v2"
|
name = "gopkg.in/testfixtures.v2"
|
||||||
version = "2.0.0"
|
version = "2.0.0"
|
||||||
|
|
||||||
[[override]]
|
|
||||||
name = "github.com/boltdb/bolt"
|
|
||||||
revision = "ccd680d8c1a0179ac3d68f692b01e1a1589cbfc7"
|
|
||||||
source = "github.com/go-gitea/bolt"
|
|
||||||
|
|
||||||
[[override]]
|
[[override]]
|
||||||
branch = "master"
|
branch = "master"
|
||||||
name = "golang.org/x/oauth2"
|
name = "golang.org/x/oauth2"
|
||||||
|
|
|
@ -1,22 +0,0 @@
|
||||||
The MIT License (MIT)
|
|
||||||
|
|
||||||
Copyright (c) 2015 Stephen Merity
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
|
||||||
in the Software without restriction, including without limitation the rights
|
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
|
||||||
furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be included in all
|
|
||||||
copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
||||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
||||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
||||||
|
|
|
@ -1,229 +0,0 @@
|
||||||
package govarint
|
|
||||||
|
|
||||||
import "encoding/binary"
|
|
||||||
import "io"
|
|
||||||
|
|
||||||
type U32VarintEncoder interface {
|
|
||||||
PutU32(x uint32) int
|
|
||||||
Close()
|
|
||||||
}
|
|
||||||
|
|
||||||
type U32VarintDecoder interface {
|
|
||||||
GetU32() (uint32, error)
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
|
|
||||||
type U64VarintEncoder interface {
|
|
||||||
PutU64(x uint64) int
|
|
||||||
Close()
|
|
||||||
}
|
|
||||||
|
|
||||||
type U64VarintDecoder interface {
|
|
||||||
GetU64() (uint64, error)
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
|
|
||||||
type U32GroupVarintEncoder struct {
|
|
||||||
w io.Writer
|
|
||||||
index int
|
|
||||||
store [4]uint32
|
|
||||||
temp [17]byte
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewU32GroupVarintEncoder(w io.Writer) *U32GroupVarintEncoder { return &U32GroupVarintEncoder{w: w} }
|
|
||||||
|
|
||||||
func (b *U32GroupVarintEncoder) Flush() (int, error) {
|
|
||||||
// TODO: Is it more efficient to have a tailored version that's called only in Close()?
|
|
||||||
// If index is zero, there are no integers to flush
|
|
||||||
if b.index == 0 {
|
|
||||||
return 0, nil
|
|
||||||
}
|
|
||||||
// In the case we're flushing (the group isn't of size four), the non-values should be zero
|
|
||||||
// This ensures the unused entries are all zero in the sizeByte
|
|
||||||
for i := b.index; i < 4; i++ {
|
|
||||||
b.store[i] = 0
|
|
||||||
}
|
|
||||||
length := 1
|
|
||||||
// We need to reset the size byte to zero as we only bitwise OR into it, we don't overwrite it
|
|
||||||
b.temp[0] = 0
|
|
||||||
for i, x := range b.store {
|
|
||||||
size := byte(0)
|
|
||||||
shifts := []byte{24, 16, 8, 0}
|
|
||||||
for _, shift := range shifts {
|
|
||||||
// Always writes at least one byte -- the first one (shift = 0)
|
|
||||||
// Will write more bytes until the rest of the integer is all zeroes
|
|
||||||
if (x>>shift) != 0 || shift == 0 {
|
|
||||||
size += 1
|
|
||||||
b.temp[length] = byte(x >> shift)
|
|
||||||
length += 1
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// We store the size in two of the eight bits in the first byte (sizeByte)
|
|
||||||
// 0 means there is one byte in total, hence why we subtract one from size
|
|
||||||
b.temp[0] |= (size - 1) << (uint8(3-i) * 2)
|
|
||||||
}
|
|
||||||
// If we're flushing without a full group of four, remove the unused bytes we computed
|
|
||||||
// This enables us to realize it's a partial group on decoding thanks to EOF
|
|
||||||
if b.index != 4 {
|
|
||||||
length -= 4 - b.index
|
|
||||||
}
|
|
||||||
_, err := b.w.Write(b.temp[:length])
|
|
||||||
return length, err
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *U32GroupVarintEncoder) PutU32(x uint32) (int, error) {
|
|
||||||
bytesWritten := 0
|
|
||||||
b.store[b.index] = x
|
|
||||||
b.index += 1
|
|
||||||
if b.index == 4 {
|
|
||||||
n, err := b.Flush()
|
|
||||||
if err != nil {
|
|
||||||
return n, err
|
|
||||||
}
|
|
||||||
bytesWritten += n
|
|
||||||
b.index = 0
|
|
||||||
}
|
|
||||||
return bytesWritten, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *U32GroupVarintEncoder) Close() {
|
|
||||||
// On Close, we flush any remaining values that might not have been in a full group
|
|
||||||
b.Flush()
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
|
|
||||||
type U32GroupVarintDecoder struct {
|
|
||||||
r io.ByteReader
|
|
||||||
group [4]uint32
|
|
||||||
pos int
|
|
||||||
finished bool
|
|
||||||
capacity int
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewU32GroupVarintDecoder(r io.ByteReader) *U32GroupVarintDecoder {
|
|
||||||
return &U32GroupVarintDecoder{r: r, pos: 4, capacity: 4}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *U32GroupVarintDecoder) getGroup() error {
|
|
||||||
// We should always receive a sizeByte if there are more values to read
|
|
||||||
sizeByte, err := b.r.ReadByte()
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
// Calculate the size of the four incoming 32 bit integers
|
|
||||||
// 0b00 means 1 byte to read, 0b01 = 2, etc
|
|
||||||
b.group[0] = uint32((sizeByte >> 6) & 3)
|
|
||||||
b.group[1] = uint32((sizeByte >> 4) & 3)
|
|
||||||
b.group[2] = uint32((sizeByte >> 2) & 3)
|
|
||||||
b.group[3] = uint32(sizeByte & 3)
|
|
||||||
//
|
|
||||||
for index, size := range b.group {
|
|
||||||
b.group[index] = 0
|
|
||||||
// Any error that occurs in earlier byte reads should be repeated at the end one
|
|
||||||
// Hence we only catch and report the final ReadByte's error
|
|
||||||
var err error
|
|
||||||
switch size {
|
|
||||||
case 0:
|
|
||||||
var x byte
|
|
||||||
x, err = b.r.ReadByte()
|
|
||||||
b.group[index] = uint32(x)
|
|
||||||
case 1:
|
|
||||||
var x, y byte
|
|
||||||
x, _ = b.r.ReadByte()
|
|
||||||
y, err = b.r.ReadByte()
|
|
||||||
b.group[index] = uint32(x)<<8 | uint32(y)
|
|
||||||
case 2:
|
|
||||||
var x, y, z byte
|
|
||||||
x, _ = b.r.ReadByte()
|
|
||||||
y, _ = b.r.ReadByte()
|
|
||||||
z, err = b.r.ReadByte()
|
|
||||||
b.group[index] = uint32(x)<<16 | uint32(y)<<8 | uint32(z)
|
|
||||||
case 3:
|
|
||||||
var x, y, z, zz byte
|
|
||||||
x, _ = b.r.ReadByte()
|
|
||||||
y, _ = b.r.ReadByte()
|
|
||||||
z, _ = b.r.ReadByte()
|
|
||||||
zz, err = b.r.ReadByte()
|
|
||||||
b.group[index] = uint32(x)<<24 | uint32(y)<<16 | uint32(z)<<8 | uint32(zz)
|
|
||||||
}
|
|
||||||
if err != nil {
|
|
||||||
if err == io.EOF {
|
|
||||||
// If we hit EOF here, we have found a partial group
|
|
||||||
// We've return any valid entries we have read and return EOF once we run out
|
|
||||||
b.capacity = index
|
|
||||||
b.finished = true
|
|
||||||
break
|
|
||||||
} else {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Reset the pos pointer to the beginning of the read values
|
|
||||||
b.pos = 0
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *U32GroupVarintDecoder) GetU32() (uint32, error) {
|
|
||||||
// Check if we have any more values to give out - if not, let's get them
|
|
||||||
if b.pos == b.capacity {
|
|
||||||
// If finished is set, there is nothing else to do
|
|
||||||
if b.finished {
|
|
||||||
return 0, io.EOF
|
|
||||||
}
|
|
||||||
err := b.getGroup()
|
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Increment pointer and return the value stored at that point
|
|
||||||
b.pos += 1
|
|
||||||
return b.group[b.pos-1], nil
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
|
|
||||||
type Base128Encoder struct {
|
|
||||||
w io.Writer
|
|
||||||
tmpBytes []byte
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewU32Base128Encoder(w io.Writer) *Base128Encoder {
|
|
||||||
return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen32)}
|
|
||||||
}
|
|
||||||
func NewU64Base128Encoder(w io.Writer) *Base128Encoder {
|
|
||||||
return &Base128Encoder{w: w, tmpBytes: make([]byte, binary.MaxVarintLen64)}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *Base128Encoder) PutU32(x uint32) (int, error) {
|
|
||||||
writtenBytes := binary.PutUvarint(b.tmpBytes, uint64(x))
|
|
||||||
return b.w.Write(b.tmpBytes[:writtenBytes])
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *Base128Encoder) PutU64(x uint64) (int, error) {
|
|
||||||
writtenBytes := binary.PutUvarint(b.tmpBytes, x)
|
|
||||||
return b.w.Write(b.tmpBytes[:writtenBytes])
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *Base128Encoder) Close() {
|
|
||||||
}
|
|
||||||
|
|
||||||
///
|
|
||||||
|
|
||||||
type Base128Decoder struct {
|
|
||||||
r io.ByteReader
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewU32Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }
|
|
||||||
func NewU64Base128Decoder(r io.ByteReader) *Base128Decoder { return &Base128Decoder{r: r} }
|
|
||||||
|
|
||||||
func (b *Base128Decoder) GetU32() (uint32, error) {
|
|
||||||
v, err := binary.ReadUvarint(b.r)
|
|
||||||
return uint32(v), err
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b *Base128Decoder) GetU64() (uint64, error) {
|
|
||||||
return binary.ReadUvarint(b.r)
|
|
||||||
}
|
|
|
@ -14,6 +14,22 @@
|
||||||
|
|
||||||
package analysis
|
package analysis
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeTokenLocation int
|
||||||
|
var reflectStaticSizeTokenFreq int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var tl TokenLocation
|
||||||
|
reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size())
|
||||||
|
var tf TokenFreq
|
||||||
|
reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size())
|
||||||
|
}
|
||||||
|
|
||||||
// TokenLocation represents one occurrence of a term at a particular location in
|
// TokenLocation represents one occurrence of a term at a particular location in
|
||||||
// a field. Start, End and Position have the same meaning as in analysis.Token.
|
// a field. Start, End and Position have the same meaning as in analysis.Token.
|
||||||
// Field and ArrayPositions identify the field value in the source document.
|
// Field and ArrayPositions identify the field value in the source document.
|
||||||
|
@ -26,6 +42,12 @@ type TokenLocation struct {
|
||||||
Position int
|
Position int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (tl *TokenLocation) Size() int {
|
||||||
|
rv := reflectStaticSizeTokenLocation
|
||||||
|
rv += len(tl.ArrayPositions) * size.SizeOfUint64
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
// TokenFreq represents all the occurrences of a term in all fields of a
|
// TokenFreq represents all the occurrences of a term in all fields of a
|
||||||
// document.
|
// document.
|
||||||
type TokenFreq struct {
|
type TokenFreq struct {
|
||||||
|
@ -34,6 +56,15 @@ type TokenFreq struct {
|
||||||
frequency int
|
frequency int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (tf *TokenFreq) Size() int {
|
||||||
|
rv := reflectStaticSizeTokenFreq
|
||||||
|
rv += len(tf.Term)
|
||||||
|
for _, loc := range tf.Locations {
|
||||||
|
rv += loc.Size()
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
func (tf *TokenFreq) Frequency() int {
|
func (tf *TokenFreq) Frequency() int {
|
||||||
return tf.frequency
|
return tf.frequency
|
||||||
}
|
}
|
||||||
|
@ -42,6 +73,16 @@ func (tf *TokenFreq) Frequency() int {
|
||||||
// fields.
|
// fields.
|
||||||
type TokenFrequencies map[string]*TokenFreq
|
type TokenFrequencies map[string]*TokenFreq
|
||||||
|
|
||||||
|
func (tfs TokenFrequencies) Size() int {
|
||||||
|
rv := size.SizeOfMap
|
||||||
|
rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr)
|
||||||
|
for k, v := range tfs {
|
||||||
|
rv += len(k)
|
||||||
|
rv += v.Size()
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
|
func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
|
||||||
// walk the new token frequencies
|
// walk the new token frequencies
|
||||||
for tfk, tf := range other {
|
for tfk, tf := range other {
|
||||||
|
|
|
@ -46,11 +46,11 @@ type Parser struct {
|
||||||
index int
|
index int
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewParser(len, position, index int) *Parser {
|
func NewParser(length, position, index int) *Parser {
|
||||||
return &Parser{
|
return &Parser{
|
||||||
bufferLen: len,
|
bufferLen: length,
|
||||||
buffer: make([]rune, 0, len),
|
buffer: make([]rune, 0, length),
|
||||||
tokens: make([]*analysis.Token, 0, len),
|
tokens: make([]*analysis.Token, 0, length),
|
||||||
position: position,
|
position: position,
|
||||||
index: index,
|
index: index,
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,7 +21,7 @@ import (
|
||||||
|
|
||||||
const Name = "unique"
|
const Name = "unique"
|
||||||
|
|
||||||
// UniqueTermFilter retains only the tokens which mark the first occurence of
|
// UniqueTermFilter retains only the tokens which mark the first occurrence of
|
||||||
// a term. Tokens whose term appears in a preceding token are dropped.
|
// a term. Tokens whose term appears in a preceding token are dropped.
|
||||||
type UniqueTermFilter struct{}
|
type UniqueTermFilter struct{}
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,19 @@
|
||||||
|
|
||||||
package document
|
package document
|
||||||
|
|
||||||
import "fmt"
|
import (
|
||||||
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeDocument int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var d Document
|
||||||
|
reflectStaticSizeDocument = int(reflect.TypeOf(d).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type Document struct {
|
type Document struct {
|
||||||
ID string `json:"id"`
|
ID string `json:"id"`
|
||||||
|
@ -30,6 +42,21 @@ func NewDocument(id string) *Document {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (d *Document) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr +
|
||||||
|
len(d.ID)
|
||||||
|
|
||||||
|
for _, entry := range d.Fields {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range d.CompositeFields {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (d *Document) AddField(f Field) *Document {
|
func (d *Document) AddField(f Field) *Document {
|
||||||
switch f := f.(type) {
|
switch f := f.(type) {
|
||||||
case *CompositeField:
|
case *CompositeField:
|
||||||
|
|
|
@ -36,4 +36,6 @@ type Field interface {
|
||||||
// that this field represents - this is a common metric for tracking
|
// that this field represents - this is a common metric for tracking
|
||||||
// the rate of indexing
|
// the rate of indexing
|
||||||
NumPlainTextBytes() uint64
|
NumPlainTextBytes() uint64
|
||||||
|
|
||||||
|
Size() int
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,10 +16,19 @@ package document
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeBooleanField int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var f BooleanField
|
||||||
|
reflectStaticSizeBooleanField = int(reflect.TypeOf(f).Size())
|
||||||
|
}
|
||||||
|
|
||||||
const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues
|
const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues
|
||||||
|
|
||||||
type BooleanField struct {
|
type BooleanField struct {
|
||||||
|
@ -30,6 +39,13 @@ type BooleanField struct {
|
||||||
numPlainTextBytes uint64
|
numPlainTextBytes uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (b *BooleanField) Size() int {
|
||||||
|
return reflectStaticSizeBooleanField + size.SizeOfPtr +
|
||||||
|
len(b.name) +
|
||||||
|
len(b.arrayPositions)*size.SizeOfUint64 +
|
||||||
|
len(b.value)
|
||||||
|
}
|
||||||
|
|
||||||
func (b *BooleanField) Name() string {
|
func (b *BooleanField) Name() string {
|
||||||
return b.name
|
return b.name
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,9 +15,19 @@
|
||||||
package document
|
package document
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeCompositeField int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var cf CompositeField
|
||||||
|
reflectStaticSizeCompositeField = int(reflect.TypeOf(cf).Size())
|
||||||
|
}
|
||||||
|
|
||||||
const DefaultCompositeIndexingOptions = IndexField
|
const DefaultCompositeIndexingOptions = IndexField
|
||||||
|
|
||||||
type CompositeField struct {
|
type CompositeField struct {
|
||||||
|
@ -54,6 +64,21 @@ func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, incl
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *CompositeField) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeCompositeField + size.SizeOfPtr +
|
||||||
|
len(c.name)
|
||||||
|
|
||||||
|
for k, _ := range c.includedFields {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, _ := range c.excludedFields {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (c *CompositeField) Name() string {
|
func (c *CompositeField) Name() string {
|
||||||
return c.name
|
return c.name
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,12 +17,21 @@ package document
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"reflect"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
"github.com/blevesearch/bleve/numeric"
|
"github.com/blevesearch/bleve/numeric"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeDateTimeField int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var f DateTimeField
|
||||||
|
reflectStaticSizeDateTimeField = int(reflect.TypeOf(f).Size())
|
||||||
|
}
|
||||||
|
|
||||||
const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues
|
const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues
|
||||||
const DefaultDateTimePrecisionStep uint = 4
|
const DefaultDateTimePrecisionStep uint = 4
|
||||||
|
|
||||||
|
@ -37,6 +46,12 @@ type DateTimeField struct {
|
||||||
numPlainTextBytes uint64
|
numPlainTextBytes uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (n *DateTimeField) Size() int {
|
||||||
|
return reflectStaticSizeDateTimeField + size.SizeOfPtr +
|
||||||
|
len(n.name) +
|
||||||
|
len(n.arrayPositions)*size.SizeOfUint64
|
||||||
|
}
|
||||||
|
|
||||||
func (n *DateTimeField) Name() string {
|
func (n *DateTimeField) Name() string {
|
||||||
return n.name
|
return n.name
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,12 +16,21 @@ package document
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
"github.com/blevesearch/bleve/geo"
|
"github.com/blevesearch/bleve/geo"
|
||||||
"github.com/blevesearch/bleve/numeric"
|
"github.com/blevesearch/bleve/numeric"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeGeoPointField int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var f GeoPointField
|
||||||
|
reflectStaticSizeGeoPointField = int(reflect.TypeOf(f).Size())
|
||||||
|
}
|
||||||
|
|
||||||
var GeoPrecisionStep uint = 9
|
var GeoPrecisionStep uint = 9
|
||||||
|
|
||||||
type GeoPointField struct {
|
type GeoPointField struct {
|
||||||
|
@ -32,6 +41,12 @@ type GeoPointField struct {
|
||||||
numPlainTextBytes uint64
|
numPlainTextBytes uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (n *GeoPointField) Size() int {
|
||||||
|
return reflectStaticSizeGeoPointField + size.SizeOfPtr +
|
||||||
|
len(n.name) +
|
||||||
|
len(n.arrayPositions)*size.SizeOfUint64
|
||||||
|
}
|
||||||
|
|
||||||
func (n *GeoPointField) Name() string {
|
func (n *GeoPointField) Name() string {
|
||||||
return n.name
|
return n.name
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,11 +16,20 @@ package document
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
"github.com/blevesearch/bleve/numeric"
|
"github.com/blevesearch/bleve/numeric"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeNumericField int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var f NumericField
|
||||||
|
reflectStaticSizeNumericField = int(reflect.TypeOf(f).Size())
|
||||||
|
}
|
||||||
|
|
||||||
const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues
|
const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues
|
||||||
|
|
||||||
const DefaultPrecisionStep uint = 4
|
const DefaultPrecisionStep uint = 4
|
||||||
|
@ -33,6 +42,12 @@ type NumericField struct {
|
||||||
numPlainTextBytes uint64
|
numPlainTextBytes uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (n *NumericField) Size() int {
|
||||||
|
return reflectStaticSizeNumericField + size.SizeOfPtr +
|
||||||
|
len(n.name) +
|
||||||
|
len(n.arrayPositions)*size.SizeOfPtr
|
||||||
|
}
|
||||||
|
|
||||||
func (n *NumericField) Name() string {
|
func (n *NumericField) Name() string {
|
||||||
return n.name
|
return n.name
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,10 +16,19 @@ package document
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeTextField int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var f TextField
|
||||||
|
reflectStaticSizeTextField = int(reflect.TypeOf(f).Size())
|
||||||
|
}
|
||||||
|
|
||||||
const DefaultTextIndexingOptions = IndexField | DocValues
|
const DefaultTextIndexingOptions = IndexField | DocValues
|
||||||
|
|
||||||
type TextField struct {
|
type TextField struct {
|
||||||
|
@ -31,6 +40,13 @@ type TextField struct {
|
||||||
numPlainTextBytes uint64
|
numPlainTextBytes uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *TextField) Size() int {
|
||||||
|
return reflectStaticSizeTextField + size.SizeOfPtr +
|
||||||
|
len(t.name) +
|
||||||
|
len(t.arrayPositions)*size.SizeOfUint64 +
|
||||||
|
len(t.value)
|
||||||
|
}
|
||||||
|
|
||||||
func (t *TextField) Name() string {
|
func (t *TextField) Name() string {
|
||||||
return t.name
|
return t.name
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,174 @@
|
||||||
|
// The code here was obtained from:
|
||||||
|
// https://github.com/mmcloughlin/geohash
|
||||||
|
|
||||||
|
// The MIT License (MIT)
|
||||||
|
// Copyright (c) 2015 Michael McLoughlin
|
||||||
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
// of this software and associated documentation files (the "Software"), to deal
|
||||||
|
// in the Software without restriction, including without limitation the rights
|
||||||
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
// copies of the Software, and to permit persons to whom the Software is
|
||||||
|
// furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
// The above copyright notice and this permission notice shall be included in all
|
||||||
|
// copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
// SOFTWARE.
|
||||||
|
|
||||||
|
package geo
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
)
|
||||||
|
|
||||||
|
// encoding is a base32 codec defined by a single alphabet string: enc maps a
// 5-bit value to its character and dec maps a byte back to its value.
type encoding struct {
	enc string
	// dec holds 0xff for every byte that does not occur in the alphabet.
	dec [256]byte
}

// newEncoding builds the codec for the given alphabet. The alphabet is
// expected to be a 32-byte string; its length is not checked here.
func newEncoding(encoder string) *encoding {
	codec := &encoding{enc: encoder}
	for b := range codec.dec {
		codec.dec[b] = 0xff
	}
	for value, ch := range []byte(encoder) {
		codec.dec[ch] = byte(value)
	}
	return codec
}

// decode packs the characters of s into a 64-bit word, five bits per
// character, with the first character ending up most significant. Only 64
// bits are kept, so s should be at most 12 characters. Bytes outside the
// alphabet are not rejected: their 0xff table entry is OR-ed in as-is.
func (e *encoding) decode(s string) uint64 {
	var word uint64
	for _, ch := range []byte(s) {
		word = word<<5 | uint64(e.dec[ch])
	}
	return word
}

// encode renders x as a fixed-width 12-character string, consuming five bits
// per character from the least significant end. Twelve characters cover 60
// bits, so the top four bits of x do not appear in the output.
func (e *encoding) encode(x uint64) string {
	var out [12]byte
	for pos := len(out) - 1; pos >= 0; pos-- {
		out[pos] = e.enc[x&0x1f]
		x >>= 5
	}
	return string(out[:])
}

// base32encoding is the codec for the geohash alphabet.
var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz")
|
||||||
|
|
||||||
|
// BoundingBox returns the region encoded by the given string geohash.
|
||||||
|
func geoBoundingBox(hash string) geoBox {
|
||||||
|
bits := uint(5 * len(hash))
|
||||||
|
inthash := base32encoding.decode(hash)
|
||||||
|
return geoBoundingBoxIntWithPrecision(inthash, bits)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Box represents a rectangle in latitude/longitude space.
|
||||||
|
type geoBox struct {
|
||||||
|
minLat float64
|
||||||
|
maxLat float64
|
||||||
|
minLng float64
|
||||||
|
maxLng float64
|
||||||
|
}
|
||||||
|
|
||||||
|
// Round returns a point inside the box, making an effort to round to minimal
|
||||||
|
// precision.
|
||||||
|
func (b geoBox) round() (lat, lng float64) {
|
||||||
|
x := maxDecimalPower(b.maxLat - b.minLat)
|
||||||
|
lat = math.Ceil(b.minLat/x) * x
|
||||||
|
x = maxDecimalPower(b.maxLng - b.minLng)
|
||||||
|
lng = math.Ceil(b.minLng/x) * x
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// exp232 is 2^32 as a float64, computed once rather than on every use.
var exp232 = math.Ldexp(1, 32)
|
||||||
|
|
||||||
|
// errorWithPrecision returns the error range in latitude and longitude for an
// integer geohash with bits of precision. Latitude receives bits/2 of the
// bits and longitude the remainder; each error is the full span (180 or 360
// degrees) halved once per bit.
func errorWithPrecision(bits uint) (latErr, lngErr float64) {
	total := int(bits)
	latBits := total / 2
	return math.Ldexp(180.0, -latBits), math.Ldexp(360.0, -(total - latBits))
}
|
||||||
|
|
||||||
|
// maxDecimalPower returns the largest power of ten that does not exceed r,
// i.e. 10 raised to floor(log10(r)). It is used when rounding a coordinate
// to minimal precision inside a box whose side has length r.
func maxDecimalPower(r float64) float64 {
	exponent := math.Floor(math.Log10(r))
	return math.Pow10(int(exponent))
}
|
||||||
|
|
||||||
|
// Encode the position of x within the range -r to +r as a 32-bit integer.
|
||||||
|
func encodeRange(x, r float64) uint32 {
|
||||||
|
p := (x + r) / (2 * r)
|
||||||
|
return uint32(p * exp232)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decode the 32-bit range encoding X back to a value in the range -r to +r.
|
||||||
|
func decodeRange(X uint32, r float64) float64 {
|
||||||
|
p := float64(X) / exp232
|
||||||
|
x := 2*r*p - r
|
||||||
|
return x
|
||||||
|
}
|
||||||
|
|
||||||
|
// squash gathers the even bitlevels of X (bits 0, 2, 4, ...) into a 32-bit
// word, so that bit 2k of X becomes bit k of the result. Odd bitlevels of X
// are masked off first and may hold any value.
func squash(X uint64) uint32 {
	// Each step halves the gap between the surviving bits: shift the upper
	// neighbour down next to the lower one, then keep the widened groups.
	steps := [...]struct {
		shift uint
		mask  uint64
	}{
		{1, 0x3333333333333333},
		{2, 0x0f0f0f0f0f0f0f0f},
		{4, 0x00ff00ff00ff00ff},
		{8, 0x0000ffff0000ffff},
		{16, 0x00000000ffffffff},
	}

	X &= 0x5555555555555555
	for _, s := range steps {
		X = (X | (X >> s.shift)) & s.mask
	}
	return uint32(X)
}
|
||||||
|
|
||||||
|
// Deinterleave the bits of X into 32-bit words containing the even and odd
|
||||||
|
// bitlevels of X, respectively.
|
||||||
|
func deinterleave(X uint64) (uint32, uint32) {
|
||||||
|
return squash(X), squash(X >> 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// BoundingBoxIntWithPrecision returns the region encoded by the integer
|
||||||
|
// geohash with the specified precision.
|
||||||
|
func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox {
|
||||||
|
fullHash := hash << (64 - bits)
|
||||||
|
latInt, lngInt := deinterleave(fullHash)
|
||||||
|
lat := decodeRange(latInt, 90)
|
||||||
|
lng := decodeRange(lngInt, 180)
|
||||||
|
latErr, lngErr := errorWithPrecision(bits)
|
||||||
|
return geoBox{
|
||||||
|
minLat: lat,
|
||||||
|
maxLat: lat + latErr,
|
||||||
|
minLng: lng,
|
||||||
|
maxLng: lng + lngErr,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
|
||||||
|
// Decode the string geohash to a (lat, lng) point.
|
||||||
|
func GeoHashDecode(hash string) (lat, lng float64) {
|
||||||
|
box := geoBoundingBox(hash)
|
||||||
|
return box.round()
|
||||||
|
}
|
|
@ -16,6 +16,7 @@ package geo
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"reflect"
|
"reflect"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -24,6 +25,8 @@ import (
|
||||||
// Container:
|
// Container:
|
||||||
// slice length 2 (GeoJSON)
|
// slice length 2 (GeoJSON)
|
||||||
// first element lon, second element lat
|
// first element lon, second element lat
|
||||||
|
// string (coordinates separated by comma, or a geohash)
|
||||||
|
// first element lat, second element lon
|
||||||
// map[string]interface{}
|
// map[string]interface{}
|
||||||
// exact keys lat and lon or lng
|
// exact keys lat and lon or lng
|
||||||
// struct
|
// struct
|
||||||
|
@ -36,10 +39,14 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
|
||||||
var foundLon, foundLat bool
|
var foundLon, foundLat bool
|
||||||
|
|
||||||
thingVal := reflect.ValueOf(thing)
|
thingVal := reflect.ValueOf(thing)
|
||||||
|
if !thingVal.IsValid() {
|
||||||
|
return lon, lat, false
|
||||||
|
}
|
||||||
|
|
||||||
thingTyp := thingVal.Type()
|
thingTyp := thingVal.Type()
|
||||||
|
|
||||||
// is it a slice
|
// is it a slice
|
||||||
if thingVal.IsValid() && thingVal.Kind() == reflect.Slice {
|
if thingVal.Kind() == reflect.Slice {
|
||||||
// must be length 2
|
// must be length 2
|
||||||
if thingVal.Len() == 2 {
|
if thingVal.Len() == 2 {
|
||||||
first := thingVal.Index(0)
|
first := thingVal.Index(0)
|
||||||
|
@ -55,6 +62,35 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// is it a string
|
||||||
|
if thingVal.Kind() == reflect.String {
|
||||||
|
geoStr := thingVal.Interface().(string)
|
||||||
|
if strings.Contains(geoStr, ",") {
|
||||||
|
// geo point with coordinates split by comma
|
||||||
|
points := strings.Split(geoStr, ",")
|
||||||
|
for i, point := range points {
|
||||||
|
// trim any leading or trailing white spaces
|
||||||
|
points[i] = strings.TrimSpace(point)
|
||||||
|
}
|
||||||
|
if len(points) == 2 {
|
||||||
|
var err error
|
||||||
|
lat, err = strconv.ParseFloat(points[0], 64)
|
||||||
|
if err == nil {
|
||||||
|
foundLat = true
|
||||||
|
}
|
||||||
|
lon, err = strconv.ParseFloat(points[1], 64)
|
||||||
|
if err == nil {
|
||||||
|
foundLon = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// geohash
|
||||||
|
lat, lon = GeoHashDecode(geoStr)
|
||||||
|
foundLat = true
|
||||||
|
foundLon = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// is it a map
|
// is it a map
|
||||||
if l, ok := thing.(map[string]interface{}); ok {
|
if l, ok := thing.(map[string]interface{}); ok {
|
||||||
if lval, ok := l["lon"]; ok {
|
if lval, ok := l["lon"]; ok {
|
||||||
|
@ -68,7 +104,7 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// now try reflection on struct fields
|
// now try reflection on struct fields
|
||||||
if thingVal.IsValid() && thingVal.Kind() == reflect.Struct {
|
if thingVal.Kind() == reflect.Struct {
|
||||||
for i := 0; i < thingVal.NumField(); i++ {
|
for i := 0; i < thingVal.NumField(); i++ {
|
||||||
fieldName := thingTyp.Field(i).Name
|
fieldName := thingTyp.Field(i).Name
|
||||||
if strings.HasPrefix(strings.ToLower(fieldName), "lon") {
|
if strings.HasPrefix(strings.ToLower(fieldName), "lon") {
|
||||||
|
@ -113,6 +149,9 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) {
|
||||||
// extract numeric value (if possible) and returns a float64
|
// extract numeric value (if possible) and returns a float64
|
||||||
func extractNumericVal(v interface{}) (float64, bool) {
|
func extractNumericVal(v interface{}) (float64, bool) {
|
||||||
val := reflect.ValueOf(v)
|
val := reflect.ValueOf(v)
|
||||||
|
if !val.IsValid() {
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
typ := val.Type()
|
typ := val.Type()
|
||||||
switch typ.Kind() {
|
switch typ.Kind() {
|
||||||
case reflect.Float32, reflect.Float64:
|
case reflect.Float32, reflect.Float64:
|
||||||
|
|
|
@ -21,6 +21,7 @@ import (
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/store"
|
"github.com/blevesearch/bleve/index/store"
|
||||||
"github.com/blevesearch/bleve/mapping"
|
"github.com/blevesearch/bleve/mapping"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
// A Batch groups together multiple Index and Delete
|
// A Batch groups together multiple Index and Delete
|
||||||
|
@ -32,6 +33,9 @@ import (
|
||||||
type Batch struct {
|
type Batch struct {
|
||||||
index Index
|
index Index
|
||||||
internal *index.Batch
|
internal *index.Batch
|
||||||
|
|
||||||
|
lastDocSize uint64
|
||||||
|
totalSize uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
// Index adds the specified index operation to the
|
// Index adds the specified index operation to the
|
||||||
|
@ -47,9 +51,22 @@ func (b *Batch) Index(id string, data interface{}) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
b.internal.Update(doc)
|
b.internal.Update(doc)
|
||||||
|
|
||||||
|
b.lastDocSize = uint64(doc.Size() +
|
||||||
|
len(id) + size.SizeOfString) // overhead from internal
|
||||||
|
b.totalSize += b.lastDocSize
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (b *Batch) LastDocSize() uint64 {
|
||||||
|
return b.lastDocSize
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Batch) TotalDocsSize() uint64 {
|
||||||
|
return b.totalSize
|
||||||
|
}
|
||||||
|
|
||||||
// IndexAdvanced adds the specified index operation to the
|
// IndexAdvanced adds the specified index operation to the
|
||||||
// batch which skips the mapping. NOTE: the bleve Index is not updated
|
// batch which skips the mapping. NOTE: the bleve Index is not updated
|
||||||
// until the batch is executed.
|
// until the batch is executed.
|
||||||
|
@ -102,6 +119,24 @@ func (b *Batch) Reset() {
|
||||||
b.internal.Reset()
|
b.internal.Reset()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (b *Batch) Merge(o *Batch) {
|
||||||
|
if o != nil && o.internal != nil {
|
||||||
|
b.internal.Merge(o.internal)
|
||||||
|
if o.LastDocSize() > 0 {
|
||||||
|
b.lastDocSize = o.LastDocSize()
|
||||||
|
}
|
||||||
|
b.totalSize = uint64(b.internal.TotalDocSize())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Batch) SetPersistedCallback(f index.BatchCallback) {
|
||||||
|
b.internal.SetPersistedCallback(f)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Batch) PersistedCallback() index.BatchCallback {
|
||||||
|
return b.internal.PersistedCallback()
|
||||||
|
}
|
||||||
|
|
||||||
// An Index implements all the indexing and searching
|
// An Index implements all the indexing and searching
|
||||||
// capabilities of bleve. An Index can be created
|
// capabilities of bleve. An Index can be created
|
||||||
// using the New() and Open() methods.
|
// using the New() and Open() methods.
|
||||||
|
|
|
@ -15,10 +15,20 @@
|
||||||
package index
|
package index
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
"github.com/blevesearch/bleve/document"
|
"github.com/blevesearch/bleve/document"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeAnalysisResult int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var ar AnalysisResult
|
||||||
|
reflectStaticSizeAnalysisResult = int(reflect.TypeOf(ar).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type IndexRow interface {
|
type IndexRow interface {
|
||||||
KeySize() int
|
KeySize() int
|
||||||
KeyTo([]byte) (int, error)
|
KeyTo([]byte) (int, error)
|
||||||
|
@ -39,6 +49,15 @@ type AnalysisResult struct {
|
||||||
Length []int
|
Length []int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (a *AnalysisResult) Size() int {
|
||||||
|
rv := reflectStaticSizeAnalysisResult
|
||||||
|
for _, analyzedI := range a.Analyzed {
|
||||||
|
rv += analyzedI.Size()
|
||||||
|
}
|
||||||
|
rv += len(a.Length) * size.SizeOfInt
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
type AnalysisWork struct {
|
type AnalysisWork struct {
|
||||||
i Index
|
i Index
|
||||||
d *document.Document
|
d *document.Document
|
||||||
|
|
|
@ -18,11 +18,23 @@ import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/document"
|
"github.com/blevesearch/bleve/document"
|
||||||
"github.com/blevesearch/bleve/index/store"
|
"github.com/blevesearch/bleve/index/store"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeTermFieldDoc int
|
||||||
|
var reflectStaticSizeTermFieldVector int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var tfd TermFieldDoc
|
||||||
|
reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
|
||||||
|
var tfv TermFieldVector
|
||||||
|
reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
|
||||||
|
}
|
||||||
|
|
||||||
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
|
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
|
||||||
|
|
||||||
type Index interface {
|
type Index interface {
|
||||||
|
@ -68,6 +80,8 @@ type IndexReader interface {
|
||||||
Document(id string) (*document.Document, error)
|
Document(id string) (*document.Document, error)
|
||||||
DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error
|
DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error
|
||||||
|
|
||||||
|
DocValueReader(fields []string) (DocValueReader, error)
|
||||||
|
|
||||||
Fields() ([]string, error)
|
Fields() ([]string, error)
|
||||||
|
|
||||||
GetInternal(key []byte) ([]byte, error)
|
GetInternal(key []byte) ([]byte, error)
|
||||||
|
@ -84,6 +98,29 @@ type IndexReader interface {
|
||||||
Close() error
|
Close() error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Regexp lists the methods of the regexp.Regexp API that bleve indexes rely
// on. Accepting this interface instead of the concrete type lets callers
// supply an alternate implementation.
type Regexp interface {
	FindStringIndex(s string) (loc []int)
	LiteralPrefix() (prefix string, complete bool)
	String() string
}
|
||||||
|
|
||||||
|
type IndexReaderRegexp interface {
|
||||||
|
FieldDictRegexp(field string, regex string) (FieldDict, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
type IndexReaderFuzzy interface {
|
||||||
|
FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
type IndexReaderOnly interface {
|
||||||
|
FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
|
||||||
|
}
|
||||||
|
|
||||||
// FieldTerms contains the terms used by a document, keyed by field
|
// FieldTerms contains the terms used by a document, keyed by field
|
||||||
type FieldTerms map[string][]string
|
type FieldTerms map[string][]string
|
||||||
|
|
||||||
|
@ -115,6 +152,11 @@ type TermFieldVector struct {
|
||||||
End uint64
|
End uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (tfv *TermFieldVector) Size() int {
|
||||||
|
return reflectStaticSizeTermFieldVector + size.SizeOfPtr +
|
||||||
|
len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64
|
||||||
|
}
|
||||||
|
|
||||||
// IndexInternalID is an opaque document identifier internal to the index impl
|
// IndexInternalID is an opaque document identifier internal to the index impl
|
||||||
type IndexInternalID []byte
|
type IndexInternalID []byte
|
||||||
|
|
||||||
|
@ -134,14 +176,27 @@ type TermFieldDoc struct {
|
||||||
Vectors []*TermFieldVector
|
Vectors []*TermFieldVector
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (tfd *TermFieldDoc) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr +
|
||||||
|
len(tfd.Term) + len(tfd.ID)
|
||||||
|
|
||||||
|
for _, entry := range tfd.Vectors {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
// Reset allows an already allocated TermFieldDoc to be reused
|
// Reset allows an already allocated TermFieldDoc to be reused
|
||||||
func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
|
func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
|
||||||
// remember the []byte used for the ID
|
// remember the []byte used for the ID
|
||||||
id := tfd.ID
|
id := tfd.ID
|
||||||
|
vectors := tfd.Vectors
|
||||||
// idiom to copy over from empty TermFieldDoc (0 allocations)
|
// idiom to copy over from empty TermFieldDoc (0 allocations)
|
||||||
*tfd = TermFieldDoc{}
|
*tfd = TermFieldDoc{}
|
||||||
// reuse the []byte already allocated (and reset len to 0)
|
// reuse the []byte already allocated (and reset len to 0)
|
||||||
tfd.ID = id[:0]
|
tfd.ID = id[:0]
|
||||||
|
tfd.Vectors = vectors[:0]
|
||||||
return tfd
|
return tfd
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -161,6 +216,8 @@ type TermFieldReader interface {
|
||||||
// Count returns the number of documents contains the term in this field.
|
// Count returns the number of documents contains the term in this field.
|
||||||
Count() uint64
|
Count() uint64
|
||||||
Close() error
|
Close() error
|
||||||
|
|
||||||
|
Size() int
|
||||||
}
|
}
|
||||||
|
|
||||||
type DictEntry struct {
|
type DictEntry struct {
|
||||||
|
@ -185,12 +242,18 @@ type DocIDReader interface {
|
||||||
// will start there instead. If ID is greater than or equal to the end of
|
// will start there instead. If ID is greater than or equal to the end of
|
||||||
// the range, Next() call will return io.EOF.
|
// the range, Next() call will return io.EOF.
|
||||||
Advance(ID IndexInternalID) (IndexInternalID, error)
|
Advance(ID IndexInternalID) (IndexInternalID, error)
|
||||||
|
|
||||||
|
Size() int
|
||||||
|
|
||||||
Close() error
|
Close() error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// BatchCallback is a function that receives an error; a Batch can carry one
// as its persisted callback.
type BatchCallback func(error)
|
||||||
|
|
||||||
type Batch struct {
|
type Batch struct {
|
||||||
IndexOps map[string]*document.Document
|
IndexOps map[string]*document.Document
|
||||||
InternalOps map[string][]byte
|
InternalOps map[string][]byte
|
||||||
|
persistedCallback BatchCallback
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewBatch() *Batch {
|
func NewBatch() *Batch {
|
||||||
|
@ -216,6 +279,14 @@ func (b *Batch) DeleteInternal(key []byte) {
|
||||||
b.InternalOps[string(key)] = nil
|
b.InternalOps[string(key)] = nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (b *Batch) SetPersistedCallback(f BatchCallback) {
|
||||||
|
b.persistedCallback = f
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Batch) PersistedCallback() BatchCallback {
|
||||||
|
return b.persistedCallback
|
||||||
|
}
|
||||||
|
|
||||||
func (b *Batch) String() string {
|
func (b *Batch) String() string {
|
||||||
rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
|
rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
|
||||||
for k, v := range b.IndexOps {
|
for k, v := range b.IndexOps {
|
||||||
|
@ -238,4 +309,53 @@ func (b *Batch) String() string {
|
||||||
func (b *Batch) Reset() {
|
func (b *Batch) Reset() {
|
||||||
b.IndexOps = make(map[string]*document.Document)
|
b.IndexOps = make(map[string]*document.Document)
|
||||||
b.InternalOps = make(map[string][]byte)
|
b.InternalOps = make(map[string][]byte)
|
||||||
|
b.persistedCallback = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Batch) Merge(o *Batch) {
|
||||||
|
for k, v := range o.IndexOps {
|
||||||
|
b.IndexOps[k] = v
|
||||||
|
}
|
||||||
|
for k, v := range o.InternalOps {
|
||||||
|
b.InternalOps[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b *Batch) TotalDocSize() int {
|
||||||
|
var s int
|
||||||
|
for k, v := range b.IndexOps {
|
||||||
|
if v != nil {
|
||||||
|
s += v.Size() + size.SizeOfString
|
||||||
|
}
|
||||||
|
s += len(k)
|
||||||
|
}
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
// Optimizable represents an optional interface that implementable by
|
||||||
|
// optimizable resources (e.g., TermFieldReaders, Searchers). These
|
||||||
|
// optimizable resources are provided the same OptimizableContext
|
||||||
|
// instance, so that they can coordinate via dynamic interface
|
||||||
|
// casting.
|
||||||
|
type Optimizable interface {
|
||||||
|
Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Optimized represents the result of an optimization, as returned by the
// Finish() method of an OptimizableContext.
type Optimized interface{}
|
||||||
|
|
||||||
|
type OptimizableContext interface {
|
||||||
|
// Once all the optimzable resources have been provided the same
|
||||||
|
// OptimizableContext instance, the optimization preparations are
|
||||||
|
// finished or completed via the Finish() method.
|
||||||
|
//
|
||||||
|
// Depending on the optimization being performed, the Finish()
|
||||||
|
// method might return a non-nil Optimized instance. For example,
|
||||||
|
// the Optimized instance might represent an optimized
|
||||||
|
// TermFieldReader instance.
|
||||||
|
Finish() (Optimized, error)
|
||||||
|
}
|
||||||
|
|
||||||
|
type DocValueReader interface {
|
||||||
|
VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,7 +19,9 @@ import (
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
|
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||||
)
|
)
|
||||||
|
|
||||||
type segmentIntroduction struct {
|
type segmentIntroduction struct {
|
||||||
|
@ -31,6 +33,12 @@ type segmentIntroduction struct {
|
||||||
|
|
||||||
applied chan error
|
applied chan error
|
||||||
persisted chan error
|
persisted chan error
|
||||||
|
persistedCallback index.BatchCallback
|
||||||
|
}
|
||||||
|
|
||||||
|
type persistIntroduction struct {
|
||||||
|
persisted map[uint64]segment.Segment
|
||||||
|
applied notificationChan
|
||||||
}
|
}
|
||||||
|
|
||||||
type epochWatcher struct {
|
type epochWatcher struct {
|
||||||
|
@ -48,6 +56,8 @@ func (s *Scorch) mainLoop() {
|
||||||
var epochWatchers []*epochWatcher
|
var epochWatchers []*epochWatcher
|
||||||
OUTER:
|
OUTER:
|
||||||
for {
|
for {
|
||||||
|
atomic.AddUint64(&s.stats.TotIntroduceLoop, 1)
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-s.closeCh:
|
case <-s.closeCh:
|
||||||
break OUTER
|
break OUTER
|
||||||
|
@ -64,6 +74,9 @@ OUTER:
|
||||||
continue OUTER
|
continue OUTER
|
||||||
}
|
}
|
||||||
|
|
||||||
|
case persist := <-s.persists:
|
||||||
|
s.introducePersist(persist)
|
||||||
|
|
||||||
case revertTo := <-s.revertToSnapshots:
|
case revertTo := <-s.revertToSnapshots:
|
||||||
err := s.revertToSnapshot(revertTo)
|
err := s.revertToSnapshot(revertTo)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -92,32 +105,38 @@ OUTER:
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
||||||
// acquire lock
|
atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1)
|
||||||
s.rootLock.Lock()
|
defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1)
|
||||||
|
|
||||||
nsegs := len(s.root.segment)
|
s.rootLock.RLock()
|
||||||
|
root := s.root
|
||||||
|
root.AddRef()
|
||||||
|
s.rootLock.RUnlock()
|
||||||
|
|
||||||
|
defer func() { _ = root.DecRef() }()
|
||||||
|
|
||||||
|
nsegs := len(root.segment)
|
||||||
|
|
||||||
// prepare new index snapshot
|
// prepare new index snapshot
|
||||||
newSnapshot := &IndexSnapshot{
|
newSnapshot := &IndexSnapshot{
|
||||||
parent: s,
|
parent: s,
|
||||||
segment: make([]*SegmentSnapshot, 0, nsegs+1),
|
segment: make([]*SegmentSnapshot, 0, nsegs+1),
|
||||||
offsets: make([]uint64, 0, nsegs+1),
|
offsets: make([]uint64, 0, nsegs+1),
|
||||||
internal: make(map[string][]byte, len(s.root.internal)),
|
internal: make(map[string][]byte, len(root.internal)),
|
||||||
epoch: s.nextSnapshotEpoch,
|
|
||||||
refs: 1,
|
refs: 1,
|
||||||
|
creator: "introduceSegment",
|
||||||
}
|
}
|
||||||
s.nextSnapshotEpoch++
|
|
||||||
|
|
||||||
// iterate through current segments
|
// iterate through current segments
|
||||||
var running uint64
|
var running uint64
|
||||||
for i := range s.root.segment {
|
var docsToPersistCount, memSegments, fileSegments uint64
|
||||||
|
for i := range root.segment {
|
||||||
// see if optimistic work included this segment
|
// see if optimistic work included this segment
|
||||||
delta, ok := next.obsoletes[s.root.segment[i].id]
|
delta, ok := next.obsoletes[root.segment[i].id]
|
||||||
if !ok {
|
if !ok {
|
||||||
var err error
|
var err error
|
||||||
delta, err = s.root.segment[i].segment.DocNumbers(next.ids)
|
delta, err = root.segment[i].segment.DocNumbers(next.ids)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.rootLock.Unlock()
|
|
||||||
next.applied <- fmt.Errorf("error computing doc numbers: %v", err)
|
next.applied <- fmt.Errorf("error computing doc numbers: %v", err)
|
||||||
close(next.applied)
|
close(next.applied)
|
||||||
_ = newSnapshot.DecRef()
|
_ = newSnapshot.DecRef()
|
||||||
|
@ -126,43 +145,60 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
||||||
}
|
}
|
||||||
|
|
||||||
newss := &SegmentSnapshot{
|
newss := &SegmentSnapshot{
|
||||||
id: s.root.segment[i].id,
|
id: root.segment[i].id,
|
||||||
segment: s.root.segment[i].segment,
|
segment: root.segment[i].segment,
|
||||||
cachedDocs: s.root.segment[i].cachedDocs,
|
cachedDocs: root.segment[i].cachedDocs,
|
||||||
|
creator: root.segment[i].creator,
|
||||||
}
|
}
|
||||||
|
|
||||||
// apply new obsoletions
|
// apply new obsoletions
|
||||||
if s.root.segment[i].deleted == nil {
|
if root.segment[i].deleted == nil {
|
||||||
newss.deleted = delta
|
newss.deleted = delta
|
||||||
} else {
|
} else {
|
||||||
newss.deleted = roaring.Or(s.root.segment[i].deleted, delta)
|
newss.deleted = roaring.Or(root.segment[i].deleted, delta)
|
||||||
|
}
|
||||||
|
if newss.deleted.IsEmpty() {
|
||||||
|
newss.deleted = nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// check for live size before copying
|
// check for live size before copying
|
||||||
if newss.LiveSize() > 0 {
|
if newss.LiveSize() > 0 {
|
||||||
newSnapshot.segment = append(newSnapshot.segment, newss)
|
newSnapshot.segment = append(newSnapshot.segment, newss)
|
||||||
s.root.segment[i].segment.AddRef()
|
root.segment[i].segment.AddRef()
|
||||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||||
running += s.root.segment[i].Count()
|
running += newss.segment.Count()
|
||||||
|
}
|
||||||
|
|
||||||
|
if isMemorySegment(root.segment[i]) {
|
||||||
|
docsToPersistCount += root.segment[i].Count()
|
||||||
|
memSegments++
|
||||||
|
} else {
|
||||||
|
fileSegments++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||||
|
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||||
|
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||||
|
|
||||||
// append new segment, if any, to end of the new index snapshot
|
// append new segment, if any, to end of the new index snapshot
|
||||||
if next.data != nil {
|
if next.data != nil {
|
||||||
newSegmentSnapshot := &SegmentSnapshot{
|
newSegmentSnapshot := &SegmentSnapshot{
|
||||||
id: next.id,
|
id: next.id,
|
||||||
segment: next.data, // take ownership of next.data's ref-count
|
segment: next.data, // take ownership of next.data's ref-count
|
||||||
cachedDocs: &cachedDocs{cache: nil},
|
cachedDocs: &cachedDocs{cache: nil},
|
||||||
|
creator: "introduceSegment",
|
||||||
}
|
}
|
||||||
newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot)
|
newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot)
|
||||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||||
|
|
||||||
// increment numItemsIntroduced which tracks the number of items
|
// increment numItemsIntroduced which tracks the number of items
|
||||||
// queued for persistence.
|
// queued for persistence.
|
||||||
atomic.AddUint64(&s.stats.numItemsIntroduced, newSegmentSnapshot.Count())
|
atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count())
|
||||||
|
atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1)
|
||||||
}
|
}
|
||||||
// copy old values
|
// copy old values
|
||||||
for key, oldVal := range s.root.internal {
|
for key, oldVal := range root.internal {
|
||||||
newSnapshot.internal[key] = oldVal
|
newSnapshot.internal[key] = oldVal
|
||||||
}
|
}
|
||||||
// set new values and apply deletes
|
// set new values and apply deletes
|
||||||
|
@ -173,12 +209,21 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
||||||
delete(newSnapshot.internal, key)
|
delete(newSnapshot.internal, key)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
newSnapshot.updateSize()
|
||||||
|
s.rootLock.Lock()
|
||||||
if next.persisted != nil {
|
if next.persisted != nil {
|
||||||
s.rootPersisted = append(s.rootPersisted, next.persisted)
|
s.rootPersisted = append(s.rootPersisted, next.persisted)
|
||||||
}
|
}
|
||||||
|
if next.persistedCallback != nil {
|
||||||
|
s.persistedCallbacks = append(s.persistedCallbacks, next.persistedCallback)
|
||||||
|
}
|
||||||
// swap in new index snapshot
|
// swap in new index snapshot
|
||||||
|
newSnapshot.epoch = s.nextSnapshotEpoch
|
||||||
|
s.nextSnapshotEpoch++
|
||||||
rootPrev := s.root
|
rootPrev := s.root
|
||||||
s.root = newSnapshot
|
s.root = newSnapshot
|
||||||
|
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||||
// release lock
|
// release lock
|
||||||
s.rootLock.Unlock()
|
s.rootLock.Unlock()
|
||||||
|
|
||||||
|
@ -191,42 +236,113 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
func (s *Scorch) introducePersist(persist *persistIntroduction) {
|
||||||
// acquire lock
|
atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1)
|
||||||
|
defer atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1)
|
||||||
|
|
||||||
s.rootLock.Lock()
|
s.rootLock.Lock()
|
||||||
|
root := s.root
|
||||||
|
root.AddRef()
|
||||||
|
nextSnapshotEpoch := s.nextSnapshotEpoch
|
||||||
|
s.nextSnapshotEpoch++
|
||||||
|
s.rootLock.Unlock()
|
||||||
|
|
||||||
// prepare new index snapshot
|
defer func() { _ = root.DecRef() }()
|
||||||
currSize := len(s.root.segment)
|
|
||||||
newSize := currSize + 1 - len(nextMerge.old)
|
|
||||||
|
|
||||||
// empty segments deletion
|
newIndexSnapshot := &IndexSnapshot{
|
||||||
if nextMerge.new == nil {
|
parent: s,
|
||||||
newSize--
|
epoch: nextSnapshotEpoch,
|
||||||
|
segment: make([]*SegmentSnapshot, len(root.segment)),
|
||||||
|
offsets: make([]uint64, len(root.offsets)),
|
||||||
|
internal: make(map[string][]byte, len(root.internal)),
|
||||||
|
refs: 1,
|
||||||
|
creator: "introducePersist",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var docsToPersistCount, memSegments, fileSegments uint64
|
||||||
|
for i, segmentSnapshot := range root.segment {
|
||||||
|
// see if this segment has been replaced
|
||||||
|
if replacement, ok := persist.persisted[segmentSnapshot.id]; ok {
|
||||||
|
newSegmentSnapshot := &SegmentSnapshot{
|
||||||
|
id: segmentSnapshot.id,
|
||||||
|
segment: replacement,
|
||||||
|
deleted: segmentSnapshot.deleted,
|
||||||
|
cachedDocs: segmentSnapshot.cachedDocs,
|
||||||
|
creator: "introducePersist",
|
||||||
|
}
|
||||||
|
newIndexSnapshot.segment[i] = newSegmentSnapshot
|
||||||
|
delete(persist.persisted, segmentSnapshot.id)
|
||||||
|
|
||||||
|
// update items persisted incase of a new segment snapshot
|
||||||
|
atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count())
|
||||||
|
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
|
||||||
|
fileSegments++
|
||||||
|
} else {
|
||||||
|
newIndexSnapshot.segment[i] = root.segment[i]
|
||||||
|
newIndexSnapshot.segment[i].segment.AddRef()
|
||||||
|
|
||||||
|
if isMemorySegment(root.segment[i]) {
|
||||||
|
docsToPersistCount += root.segment[i].Count()
|
||||||
|
memSegments++
|
||||||
|
} else {
|
||||||
|
fileSegments++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
newIndexSnapshot.offsets[i] = root.offsets[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, v := range root.internal {
|
||||||
|
newIndexSnapshot.internal[k] = v
|
||||||
|
}
|
||||||
|
|
||||||
|
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||||
|
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||||
|
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||||
|
newIndexSnapshot.updateSize()
|
||||||
|
s.rootLock.Lock()
|
||||||
|
rootPrev := s.root
|
||||||
|
s.root = newIndexSnapshot
|
||||||
|
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||||
|
s.rootLock.Unlock()
|
||||||
|
|
||||||
|
if rootPrev != nil {
|
||||||
|
_ = rootPrev.DecRef()
|
||||||
|
}
|
||||||
|
|
||||||
|
close(persist.applied)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
||||||
|
atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1)
|
||||||
|
defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1)
|
||||||
|
|
||||||
|
s.rootLock.RLock()
|
||||||
|
root := s.root
|
||||||
|
root.AddRef()
|
||||||
|
s.rootLock.RUnlock()
|
||||||
|
|
||||||
|
defer func() { _ = root.DecRef() }()
|
||||||
|
|
||||||
newSnapshot := &IndexSnapshot{
|
newSnapshot := &IndexSnapshot{
|
||||||
parent: s,
|
parent: s,
|
||||||
segment: make([]*SegmentSnapshot, 0, newSize),
|
internal: root.internal,
|
||||||
offsets: make([]uint64, 0, newSize),
|
|
||||||
internal: s.root.internal,
|
|
||||||
epoch: s.nextSnapshotEpoch,
|
|
||||||
refs: 1,
|
refs: 1,
|
||||||
|
creator: "introduceMerge",
|
||||||
}
|
}
|
||||||
s.nextSnapshotEpoch++
|
|
||||||
|
|
||||||
// iterate through current segments
|
// iterate through current segments
|
||||||
newSegmentDeleted := roaring.NewBitmap()
|
newSegmentDeleted := roaring.NewBitmap()
|
||||||
var running uint64
|
var running, docsToPersistCount, memSegments, fileSegments uint64
|
||||||
for i := range s.root.segment {
|
for i := range root.segment {
|
||||||
segmentID := s.root.segment[i].id
|
segmentID := root.segment[i].id
|
||||||
if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok {
|
if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok {
|
||||||
// this segment is going away, see if anything else was deleted since we started the merge
|
// this segment is going away, see if anything else was deleted since we started the merge
|
||||||
if segSnapAtMerge != nil && s.root.segment[i].deleted != nil {
|
if segSnapAtMerge != nil && root.segment[i].deleted != nil {
|
||||||
// assume all these deletes are new
|
// assume all these deletes are new
|
||||||
deletedSince := s.root.segment[i].deleted
|
deletedSince := root.segment[i].deleted
|
||||||
// if we already knew about some of them, remove
|
// if we already knew about some of them, remove
|
||||||
if segSnapAtMerge.deleted != nil {
|
if segSnapAtMerge.deleted != nil {
|
||||||
deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted)
|
deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted)
|
||||||
}
|
}
|
||||||
deletedSinceItr := deletedSince.Iterator()
|
deletedSinceItr := deletedSince.Iterator()
|
||||||
for deletedSinceItr.HasNext() {
|
for deletedSinceItr.HasNext() {
|
||||||
|
@ -240,18 +356,25 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
||||||
// segments left behind in old map after processing
|
// segments left behind in old map after processing
|
||||||
// the root segments would be the obsolete segment set
|
// the root segments would be the obsolete segment set
|
||||||
delete(nextMerge.old, segmentID)
|
delete(nextMerge.old, segmentID)
|
||||||
|
} else if root.segment[i].LiveSize() > 0 {
|
||||||
} else if s.root.segment[i].LiveSize() > 0 {
|
|
||||||
// this segment is staying
|
// this segment is staying
|
||||||
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{
|
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{
|
||||||
id: s.root.segment[i].id,
|
id: root.segment[i].id,
|
||||||
segment: s.root.segment[i].segment,
|
segment: root.segment[i].segment,
|
||||||
deleted: s.root.segment[i].deleted,
|
deleted: root.segment[i].deleted,
|
||||||
cachedDocs: s.root.segment[i].cachedDocs,
|
cachedDocs: root.segment[i].cachedDocs,
|
||||||
|
creator: root.segment[i].creator,
|
||||||
})
|
})
|
||||||
s.root.segment[i].segment.AddRef()
|
root.segment[i].segment.AddRef()
|
||||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||||
running += s.root.segment[i].Count()
|
running += root.segment[i].segment.Count()
|
||||||
|
|
||||||
|
if isMemorySegment(root.segment[i]) {
|
||||||
|
docsToPersistCount += root.segment[i].Count()
|
||||||
|
memSegments++
|
||||||
|
} else {
|
||||||
|
fileSegments++
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -269,6 +392,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// In case where all the docs in the newly merged segment getting
|
// In case where all the docs in the newly merged segment getting
|
||||||
// deleted by the time we reach here, can skip the introduction.
|
// deleted by the time we reach here, can skip the introduction.
|
||||||
if nextMerge.new != nil &&
|
if nextMerge.new != nil &&
|
||||||
|
@ -279,15 +403,35 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
||||||
segment: nextMerge.new, // take ownership for nextMerge.new's ref-count
|
segment: nextMerge.new, // take ownership for nextMerge.new's ref-count
|
||||||
deleted: newSegmentDeleted,
|
deleted: newSegmentDeleted,
|
||||||
cachedDocs: &cachedDocs{cache: nil},
|
cachedDocs: &cachedDocs{cache: nil},
|
||||||
|
creator: "introduceMerge",
|
||||||
})
|
})
|
||||||
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
newSnapshot.offsets = append(newSnapshot.offsets, running)
|
||||||
|
atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1)
|
||||||
|
|
||||||
|
switch nextMerge.new.(type) {
|
||||||
|
case *zap.SegmentBase:
|
||||||
|
docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality()
|
||||||
|
memSegments++
|
||||||
|
case *zap.Segment:
|
||||||
|
fileSegments++
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||||
|
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||||
|
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||||
|
|
||||||
newSnapshot.AddRef() // 1 ref for the nextMerge.notify response
|
newSnapshot.AddRef() // 1 ref for the nextMerge.notify response
|
||||||
|
|
||||||
// swap in new segment
|
newSnapshot.updateSize()
|
||||||
|
|
||||||
|
s.rootLock.Lock()
|
||||||
|
// swap in new index snapshot
|
||||||
|
newSnapshot.epoch = s.nextSnapshotEpoch
|
||||||
|
s.nextSnapshotEpoch++
|
||||||
rootPrev := s.root
|
rootPrev := s.root
|
||||||
s.root = newSnapshot
|
s.root = newSnapshot
|
||||||
|
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||||
// release lock
|
// release lock
|
||||||
s.rootLock.Unlock()
|
s.rootLock.Unlock()
|
||||||
|
|
||||||
|
@ -301,6 +445,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
||||||
|
atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1)
|
||||||
|
defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1)
|
||||||
|
|
||||||
if revertTo.snapshot == nil {
|
if revertTo.snapshot == nil {
|
||||||
err := fmt.Errorf("Cannot revert to a nil snapshot")
|
err := fmt.Errorf("Cannot revert to a nil snapshot")
|
||||||
revertTo.applied <- err
|
revertTo.applied <- err
|
||||||
|
@ -318,9 +465,11 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
||||||
internal: revertTo.snapshot.internal,
|
internal: revertTo.snapshot.internal,
|
||||||
epoch: s.nextSnapshotEpoch,
|
epoch: s.nextSnapshotEpoch,
|
||||||
refs: 1,
|
refs: 1,
|
||||||
|
creator: "revertToSnapshot",
|
||||||
}
|
}
|
||||||
s.nextSnapshotEpoch++
|
s.nextSnapshotEpoch++
|
||||||
|
|
||||||
|
var docsToPersistCount, memSegments, fileSegments uint64
|
||||||
// iterate through segments
|
// iterate through segments
|
||||||
for i, segmentSnapshot := range revertTo.snapshot.segment {
|
for i, segmentSnapshot := range revertTo.snapshot.segment {
|
||||||
newSnapshot.segment[i] = &SegmentSnapshot{
|
newSnapshot.segment[i] = &SegmentSnapshot{
|
||||||
|
@ -328,21 +477,37 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
||||||
segment: segmentSnapshot.segment,
|
segment: segmentSnapshot.segment,
|
||||||
deleted: segmentSnapshot.deleted,
|
deleted: segmentSnapshot.deleted,
|
||||||
cachedDocs: segmentSnapshot.cachedDocs,
|
cachedDocs: segmentSnapshot.cachedDocs,
|
||||||
|
creator: segmentSnapshot.creator,
|
||||||
}
|
}
|
||||||
newSnapshot.segment[i].segment.AddRef()
|
newSnapshot.segment[i].segment.AddRef()
|
||||||
|
|
||||||
// remove segment from ineligibleForRemoval map
|
// remove segment from ineligibleForRemoval map
|
||||||
filename := zapFileName(segmentSnapshot.id)
|
filename := zapFileName(segmentSnapshot.id)
|
||||||
delete(s.ineligibleForRemoval, filename)
|
delete(s.ineligibleForRemoval, filename)
|
||||||
|
|
||||||
|
if isMemorySegment(segmentSnapshot) {
|
||||||
|
docsToPersistCount += segmentSnapshot.Count()
|
||||||
|
memSegments++
|
||||||
|
} else {
|
||||||
|
fileSegments++
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount)
|
||||||
|
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments)
|
||||||
|
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments)
|
||||||
|
|
||||||
if revertTo.persisted != nil {
|
if revertTo.persisted != nil {
|
||||||
s.rootPersisted = append(s.rootPersisted, revertTo.persisted)
|
s.rootPersisted = append(s.rootPersisted, revertTo.persisted)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
newSnapshot.updateSize()
|
||||||
|
|
||||||
// swap in new snapshot
|
// swap in new snapshot
|
||||||
rootPrev := s.root
|
rootPrev := s.root
|
||||||
s.root = newSnapshot
|
s.root = newSnapshot
|
||||||
|
|
||||||
|
atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch)
|
||||||
// release lock
|
// release lock
|
||||||
s.rootLock.Unlock()
|
s.rootLock.Unlock()
|
||||||
|
|
||||||
|
@ -354,3 +519,12 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error {
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isMemorySegment(s *SegmentSnapshot) bool {
|
||||||
|
switch s.segment.(type) {
|
||||||
|
case *zap.SegmentBase:
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -15,9 +15,7 @@
|
||||||
package scorch
|
package scorch
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
@ -40,16 +38,20 @@ func (s *Scorch) mergerLoop() {
|
||||||
|
|
||||||
OUTER:
|
OUTER:
|
||||||
for {
|
for {
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeLoopBeg, 1)
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-s.closeCh:
|
case <-s.closeCh:
|
||||||
break OUTER
|
break OUTER
|
||||||
|
|
||||||
default:
|
default:
|
||||||
// check to see if there is a new snapshot to persist
|
// check to see if there is a new snapshot to persist
|
||||||
s.rootLock.RLock()
|
s.rootLock.Lock()
|
||||||
ourSnapshot := s.root
|
ourSnapshot := s.root
|
||||||
ourSnapshot.AddRef()
|
ourSnapshot.AddRef()
|
||||||
s.rootLock.RUnlock()
|
atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size()))
|
||||||
|
atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch)
|
||||||
|
s.rootLock.Unlock()
|
||||||
|
|
||||||
if ourSnapshot.epoch != lastEpochMergePlanned {
|
if ourSnapshot.epoch != lastEpochMergePlanned {
|
||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
|
@ -57,12 +59,21 @@ OUTER:
|
||||||
// lets get started
|
// lets get started
|
||||||
err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions)
|
err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
atomic.StoreUint64(&s.iStats.mergeEpoch, 0)
|
||||||
|
if err == segment.ErrClosed {
|
||||||
|
// index has been closed
|
||||||
|
_ = ourSnapshot.DecRef()
|
||||||
|
break OUTER
|
||||||
|
}
|
||||||
s.fireAsyncError(fmt.Errorf("merging err: %v", err))
|
s.fireAsyncError(fmt.Errorf("merging err: %v", err))
|
||||||
_ = ourSnapshot.DecRef()
|
_ = ourSnapshot.DecRef()
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1)
|
||||||
continue OUTER
|
continue OUTER
|
||||||
}
|
}
|
||||||
lastEpochMergePlanned = ourSnapshot.epoch
|
lastEpochMergePlanned = ourSnapshot.epoch
|
||||||
|
|
||||||
|
atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch)
|
||||||
|
|
||||||
s.fireEvent(EventKindMergerProgress, time.Since(startTime))
|
s.fireEvent(EventKindMergerProgress, time.Since(startTime))
|
||||||
}
|
}
|
||||||
_ = ourSnapshot.DecRef()
|
_ = ourSnapshot.DecRef()
|
||||||
|
@ -88,7 +99,10 @@ OUTER:
|
||||||
case <-ew.notifyCh:
|
case <-ew.notifyCh:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeLoopEnd, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
s.asyncTasks.Done()
|
s.asyncTasks.Done()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -105,6 +119,11 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions,
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return &mergePlannerOptions, err
|
return &mergePlannerOptions, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
err = mergeplan.ValidateMergePlannerOptions(&mergePlannerOptions)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return &mergePlannerOptions, nil
|
return &mergePlannerOptions, nil
|
||||||
}
|
}
|
||||||
|
@ -119,32 +138,45 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlan, 1)
|
||||||
|
|
||||||
// give this list to the planner
|
// give this list to the planner
|
||||||
resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options)
|
resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1)
|
||||||
return fmt.Errorf("merge planning err: %v", err)
|
return fmt.Errorf("merge planning err: %v", err)
|
||||||
}
|
}
|
||||||
if resultMergePlan == nil {
|
if resultMergePlan == nil {
|
||||||
// nothing to do
|
// nothing to do
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1)
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks)))
|
||||||
|
|
||||||
// process tasks in serial for now
|
// process tasks in serial for now
|
||||||
var notifications []chan *IndexSnapshot
|
var notifications []chan *IndexSnapshot
|
||||||
for _, task := range resultMergePlan.Tasks {
|
for _, task := range resultMergePlan.Tasks {
|
||||||
if len(task.Segments) == 0 {
|
if len(task.Segments) == 0 {
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments)))
|
||||||
|
|
||||||
oldMap := make(map[uint64]*SegmentSnapshot)
|
oldMap := make(map[uint64]*SegmentSnapshot)
|
||||||
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
|
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
|
||||||
segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments))
|
segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments))
|
||||||
docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments))
|
docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments))
|
||||||
|
|
||||||
for _, planSegment := range task.Segments {
|
for _, planSegment := range task.Segments {
|
||||||
if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok {
|
if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok {
|
||||||
oldMap[segSnapshot.id] = segSnapshot
|
oldMap[segSnapshot.id] = segSnapshot
|
||||||
if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok {
|
if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok {
|
||||||
if segSnapshot.LiveSize() == 0 {
|
if segSnapshot.LiveSize() == 0 {
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1)
|
||||||
oldMap[segSnapshot.id] = nil
|
oldMap[segSnapshot.id] = nil
|
||||||
} else {
|
} else {
|
||||||
segmentsToMerge = append(segmentsToMerge, zapSeg)
|
segmentsToMerge = append(segmentsToMerge, zapSeg)
|
||||||
|
@ -155,32 +187,53 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
|
||||||
}
|
}
|
||||||
|
|
||||||
var oldNewDocNums map[uint64][]uint64
|
var oldNewDocNums map[uint64][]uint64
|
||||||
var segment segment.Segment
|
var seg segment.Segment
|
||||||
if len(segmentsToMerge) > 0 {
|
if len(segmentsToMerge) > 0 {
|
||||||
filename := zapFileName(newSegmentID)
|
filename := zapFileName(newSegmentID)
|
||||||
s.markIneligibleForRemoval(filename)
|
s.markIneligibleForRemoval(filename)
|
||||||
path := s.path + string(os.PathSeparator) + filename
|
path := s.path + string(os.PathSeparator) + filename
|
||||||
newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024)
|
|
||||||
|
fileMergeZapStartTime := time.Now()
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1)
|
||||||
|
newDocNums, _, err := zap.Merge(segmentsToMerge, docsToDrop, path,
|
||||||
|
DefaultChunkFactor, s.closeCh, s)
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1)
|
||||||
|
|
||||||
|
fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime))
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime)
|
||||||
|
if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime {
|
||||||
|
atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime)
|
||||||
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.unmarkIneligibleForRemoval(filename)
|
s.unmarkIneligibleForRemoval(filename)
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
|
||||||
|
if err == segment.ErrClosed {
|
||||||
|
return err
|
||||||
|
}
|
||||||
return fmt.Errorf("merging failed: %v", err)
|
return fmt.Errorf("merging failed: %v", err)
|
||||||
}
|
}
|
||||||
segment, err = zap.Open(path)
|
|
||||||
|
seg, err = zap.Open(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
s.unmarkIneligibleForRemoval(filename)
|
s.unmarkIneligibleForRemoval(filename)
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
oldNewDocNums = make(map[uint64][]uint64)
|
oldNewDocNums = make(map[uint64][]uint64)
|
||||||
for i, segNewDocNums := range newDocNums {
|
for i, segNewDocNums := range newDocNums {
|
||||||
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums
|
oldNewDocNums[task.Segments[i].Id()] = segNewDocNums
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge)))
|
||||||
}
|
}
|
||||||
|
|
||||||
sm := &segmentMerge{
|
sm := &segmentMerge{
|
||||||
id: newSegmentID,
|
id: newSegmentID,
|
||||||
old: oldMap,
|
old: oldMap,
|
||||||
oldNewDocNums: oldNewDocNums,
|
oldNewDocNums: oldNewDocNums,
|
||||||
new: segment,
|
new: seg,
|
||||||
notify: make(chan *IndexSnapshot, 1),
|
notify: make(chan *IndexSnapshot, 1),
|
||||||
}
|
}
|
||||||
notifications = append(notifications, sm.notify)
|
notifications = append(notifications, sm.notify)
|
||||||
|
@ -188,21 +241,28 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot,
|
||||||
// give it to the introducer
|
// give it to the introducer
|
||||||
select {
|
select {
|
||||||
case <-s.closeCh:
|
case <-s.closeCh:
|
||||||
_ = segment.Close()
|
_ = seg.Close()
|
||||||
return nil
|
return segment.ErrClosed
|
||||||
case s.merges <- sm:
|
case s.merges <- sm:
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, notification := range notifications {
|
for _, notification := range notifications {
|
||||||
select {
|
select {
|
||||||
case <-s.closeCh:
|
case <-s.closeCh:
|
||||||
return nil
|
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1)
|
||||||
|
return segment.ErrClosed
|
||||||
case newSnapshot := <-notification:
|
case newSnapshot := <-notification:
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1)
|
||||||
if newSnapshot != nil {
|
if newSnapshot != nil {
|
||||||
_ = newSnapshot.DecRef()
|
_ = newSnapshot.DecRef()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -219,44 +279,48 @@ type segmentMerge struct {
|
||||||
// into the root
|
// into the root
|
||||||
func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
|
func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
|
||||||
sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int,
|
sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int,
|
||||||
chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) {
|
chunkFactor uint32) (*IndexSnapshot, uint64, error) {
|
||||||
var br bytes.Buffer
|
atomic.AddUint64(&s.stats.TotMemMergeBeg, 1)
|
||||||
|
|
||||||
cr := zap.NewCountHashWriter(&br)
|
memMergeZapStartTime := time.Now()
|
||||||
|
|
||||||
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset,
|
atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1)
|
||||||
docValueOffset, dictLocs, fieldsInv, fieldsMap, err :=
|
|
||||||
zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr)
|
|
||||||
if err != nil {
|
|
||||||
return 0, nil, 0, err
|
|
||||||
}
|
|
||||||
|
|
||||||
sb, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor,
|
|
||||||
fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset,
|
|
||||||
docValueOffset, dictLocs)
|
|
||||||
if err != nil {
|
|
||||||
return 0, nil, 0, err
|
|
||||||
}
|
|
||||||
|
|
||||||
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
|
newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1)
|
||||||
|
|
||||||
filename := zapFileName(newSegmentID)
|
filename := zapFileName(newSegmentID)
|
||||||
path := s.path + string(os.PathSeparator) + filename
|
path := s.path + string(os.PathSeparator) + filename
|
||||||
err = zap.PersistSegmentBase(sb, path)
|
|
||||||
if err != nil {
|
newDocNums, _, err :=
|
||||||
return 0, nil, 0, err
|
zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh, s)
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1)
|
||||||
|
|
||||||
|
memMergeZapTime := uint64(time.Since(memMergeZapStartTime))
|
||||||
|
atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime)
|
||||||
|
if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime {
|
||||||
|
atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime)
|
||||||
}
|
}
|
||||||
|
|
||||||
segment, err := zap.Open(path)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, 0, err
|
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
|
||||||
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
seg, err := zap.Open(path)
|
||||||
|
if err != nil {
|
||||||
|
atomic.AddUint64(&s.stats.TotMemMergeErr, 1)
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// update persisted stats
|
||||||
|
atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count())
|
||||||
|
atomic.AddUint64(&s.stats.TotPersistedSegments, 1)
|
||||||
|
|
||||||
sm := &segmentMerge{
|
sm := &segmentMerge{
|
||||||
id: newSegmentID,
|
id: newSegmentID,
|
||||||
old: make(map[uint64]*SegmentSnapshot),
|
old: make(map[uint64]*SegmentSnapshot),
|
||||||
oldNewDocNums: make(map[uint64][]uint64),
|
oldNewDocNums: make(map[uint64][]uint64),
|
||||||
new: segment,
|
new: seg,
|
||||||
notify: make(chan *IndexSnapshot, 1),
|
notify: make(chan *IndexSnapshot, 1),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -268,15 +332,21 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot,
|
||||||
|
|
||||||
select { // send to introducer
|
select { // send to introducer
|
||||||
case <-s.closeCh:
|
case <-s.closeCh:
|
||||||
_ = segment.DecRef()
|
_ = seg.DecRef()
|
||||||
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed?
|
return nil, 0, segment.ErrClosed
|
||||||
case s.merges <- sm:
|
case s.merges <- sm:
|
||||||
}
|
}
|
||||||
|
|
||||||
select { // wait for introduction to complete
|
select { // wait for introduction to complete
|
||||||
case <-s.closeCh:
|
case <-s.closeCh:
|
||||||
return 0, nil, 0, nil // TODO: return ErrInterruptedClosed?
|
return nil, 0, segment.ErrClosed
|
||||||
case newSnapshot := <-sm.notify:
|
case newSnapshot := <-sm.notify:
|
||||||
return numDocs, newSnapshot, newSegmentID, nil
|
atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs)))
|
||||||
|
atomic.AddUint64(&s.stats.TotMemMergeDone, 1)
|
||||||
|
return newSnapshot, newSegmentID, nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ReportBytesWritten atomically adds bytesWritten to the
// TotFileMergeWrittenBytes counter in s.stats.
//
// NOTE(review): s is passed as the final argument to zap.Merge and
// zap.MergeSegmentBases elsewhere in this file, so this is presumably the
// callback those mergers use to report how much they wrote — confirm against
// the zap package.
func (s *Scorch) ReportBytesWritten(bytesWritten uint64) {
|
||||||
|
atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, bytesWritten)
|
||||||
|
}
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
package mergeplan
|
package mergeplan
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
"sort"
|
"sort"
|
||||||
|
@ -115,7 +116,15 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 {
|
||||||
return o.FloorSegmentSize
|
return o.FloorSegmentSize
|
||||||
}
|
}
|
||||||
|
|
||||||
// Suggested default options.
|
// MaxSegmentSizeLimit represents the maximum size of a segment,
|
||||||
|
// this limit comes with hit-1 optimisation/max encoding limit uint31.
|
||||||
|
const MaxSegmentSizeLimit = 1<<31 - 1
|
||||||
|
|
||||||
|
// ErrMaxSegmentSizeTooLarge is returned when the size of the segment
|
||||||
|
// exceeds the MaxSegmentSizeLimit
|
||||||
|
var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit")
|
||||||
|
|
||||||
|
// DefaultMergePlanOptions suggests the default options.
|
||||||
var DefaultMergePlanOptions = MergePlanOptions{
|
var DefaultMergePlanOptions = MergePlanOptions{
|
||||||
MaxSegmentsPerTier: 10,
|
MaxSegmentsPerTier: 10,
|
||||||
MaxSegmentSize: 5000000,
|
MaxSegmentSize: 5000000,
|
||||||
|
@ -208,14 +217,14 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) {
|
||||||
if len(roster) > 0 {
|
if len(roster) > 0 {
|
||||||
rosterScore := scoreSegments(roster, o)
|
rosterScore := scoreSegments(roster, o)
|
||||||
|
|
||||||
if len(bestRoster) <= 0 || rosterScore < bestRosterScore {
|
if len(bestRoster) == 0 || rosterScore < bestRosterScore {
|
||||||
bestRoster = roster
|
bestRoster = roster
|
||||||
bestRosterScore = rosterScore
|
bestRosterScore = rosterScore
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(bestRoster) <= 0 {
|
if len(bestRoster) == 0 {
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -367,3 +376,11 @@ func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan)
|
||||||
|
|
||||||
return strings.Join(rv, "\n")
|
return strings.Join(rv, "\n")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ValidateMergePlannerOptions validates the merge planner options
|
||||||
|
func ValidateMergePlannerOptions(options *MergePlanOptions) error {
|
||||||
|
if options.MaxSegmentSize > MaxSegmentSizeLimit {
|
||||||
|
return ErrMaxSegmentSizeTooLarge
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,420 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package scorch
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"github.com/RoaringBitmap/roaring"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/index"
|
||||||
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
|
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||||
|
)
|
||||||
|
|
||||||
|
// OptimizeConjunction gates the "conjunction" optimization: when false,
// IndexSnapshotTermFieldReader.Optimize returns the context unchanged for
// that kind instead of calling optimizeConjunction.
var OptimizeConjunction = true
|
||||||
|
// OptimizeConjunctionUnadorned gates the "conjunction:unadorned"
// optimization in IndexSnapshotTermFieldReader.Optimize in the same way.
var OptimizeConjunctionUnadorned = true
|
||||||
|
// OptimizeDisjunctionUnadorned gates the "disjunction:unadorned"
// optimization in IndexSnapshotTermFieldReader.Optimize in the same way.
var OptimizeDisjunctionUnadorned = true
|
||||||
|
|
||||||
|
func (s *IndexSnapshotTermFieldReader) Optimize(kind string,
|
||||||
|
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||||
|
if OptimizeConjunction && kind == "conjunction" {
|
||||||
|
return s.optimizeConjunction(octx)
|
||||||
|
}
|
||||||
|
|
||||||
|
if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" {
|
||||||
|
return s.optimizeConjunctionUnadorned(octx)
|
||||||
|
}
|
||||||
|
|
||||||
|
if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" {
|
||||||
|
return s.optimizeDisjunctionUnadorned(octx)
|
||||||
|
}
|
||||||
|
|
||||||
|
return octx, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// OptimizeDisjunctionUnadornedMinChildCardinality is the threshold used
// by the unadorned disjunction optimization: for each segment, the
// largest cardinality among the children's actual bitmaps must reach
// this value, otherwise the optimization is not applied.
var OptimizeDisjunctionUnadornedMinChildCardinality uint64 = 256
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
func (s *IndexSnapshotTermFieldReader) optimizeConjunction(
|
||||||
|
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||||
|
if octx == nil {
|
||||||
|
octx = &OptimizeTFRConjunction{snapshot: s.snapshot}
|
||||||
|
}
|
||||||
|
|
||||||
|
o, ok := octx.(*OptimizeTFRConjunction)
|
||||||
|
if !ok {
|
||||||
|
return octx, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if o.snapshot != s.snapshot {
|
||||||
|
return nil, fmt.Errorf("tried to optimize conjunction across different snapshots")
|
||||||
|
}
|
||||||
|
|
||||||
|
o.tfrs = append(o.tfrs, s)
|
||||||
|
|
||||||
|
return o, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type OptimizeTFRConjunction struct {
|
||||||
|
snapshot *IndexSnapshot
|
||||||
|
|
||||||
|
tfrs []*IndexSnapshotTermFieldReader
|
||||||
|
}
|
||||||
|
|
||||||
|
func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) {
|
||||||
|
if len(o.tfrs) <= 1 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range o.snapshot.segment {
|
||||||
|
itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator)
|
||||||
|
if !ok || itr0.ActualBM == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator)
|
||||||
|
if !ok || itr1.ActualBM == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
bm := roaring.And(itr0.ActualBM, itr1.ActualBM)
|
||||||
|
|
||||||
|
for _, tfr := range o.tfrs[2:] {
|
||||||
|
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||||
|
if !ok || itr.ActualBM == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
bm.And(itr.ActualBM)
|
||||||
|
}
|
||||||
|
|
||||||
|
// in this conjunction optimization, the postings iterators
|
||||||
|
// will all share the same AND'ed together actual bitmap. The
|
||||||
|
// regular conjunction searcher machinery will still be used,
|
||||||
|
// but the underlying bitmap will be smaller.
|
||||||
|
for _, tfr := range o.tfrs {
|
||||||
|
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||||
|
if ok && itr.ActualBM != nil {
|
||||||
|
itr.ActualBM = bm
|
||||||
|
itr.Actual = bm.Iterator()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
// An "unadorned" conjunction optimization is appropriate when
|
||||||
|
// additional or subsidiary information like freq-norm's and
|
||||||
|
// term-vectors are not required, and instead only the internal-id's
|
||||||
|
// are needed.
|
||||||
|
func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned(
|
||||||
|
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||||
|
if octx == nil {
|
||||||
|
octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot}
|
||||||
|
}
|
||||||
|
|
||||||
|
o, ok := octx.(*OptimizeTFRConjunctionUnadorned)
|
||||||
|
if !ok {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if o.snapshot != s.snapshot {
|
||||||
|
return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots")
|
||||||
|
}
|
||||||
|
|
||||||
|
o.tfrs = append(o.tfrs, s)
|
||||||
|
|
||||||
|
return o, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type OptimizeTFRConjunctionUnadorned struct {
|
||||||
|
snapshot *IndexSnapshot
|
||||||
|
|
||||||
|
tfrs []*IndexSnapshotTermFieldReader
|
||||||
|
}
|
||||||
|
|
||||||
|
// Artificial term and field assigned to the reader produced by the
// unadorned conjunction optimization, which may stand for several real
// terms and fields at once.
var (
	OptimizeTFRConjunctionUnadornedTerm  = []byte("<conjunction:unadorned>")
	OptimizeTFRConjunctionUnadornedField = "*"
)
|
||||||
|
|
||||||
|
// Finish of an unadorned conjunction optimization will compute a
|
||||||
|
// termFieldReader with an "actual" bitmap that represents the
|
||||||
|
// constituent bitmaps AND'ed together. This termFieldReader cannot
|
||||||
|
// provide any freq-norm or termVector associated information.
|
||||||
|
func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) {
|
||||||
|
if len(o.tfrs) <= 1 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// We use an artificial term and field because the optimized
|
||||||
|
// termFieldReader can represent multiple terms and fields.
|
||||||
|
oTFR := &IndexSnapshotTermFieldReader{
|
||||||
|
term: OptimizeTFRConjunctionUnadornedTerm,
|
||||||
|
field: OptimizeTFRConjunctionUnadornedField,
|
||||||
|
snapshot: o.snapshot,
|
||||||
|
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
|
||||||
|
segmentOffset: 0,
|
||||||
|
includeFreq: false,
|
||||||
|
includeNorm: false,
|
||||||
|
includeTermVectors: false,
|
||||||
|
}
|
||||||
|
|
||||||
|
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
|
||||||
|
|
||||||
|
OUTER:
|
||||||
|
for i := range o.snapshot.segment {
|
||||||
|
actualBMs = actualBMs[:0]
|
||||||
|
|
||||||
|
var docNum1HitLast uint64
|
||||||
|
var docNum1HitLastOk bool
|
||||||
|
|
||||||
|
for _, tfr := range o.tfrs {
|
||||||
|
if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok {
|
||||||
|
// An empty postings iterator means the entire AND is empty.
|
||||||
|
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||||
|
continue OUTER
|
||||||
|
}
|
||||||
|
|
||||||
|
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||||
|
if !ok {
|
||||||
|
// We optimize zap postings iterators only.
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the postings iterator is "1-hit" optimized, then we
|
||||||
|
// can perform several optimizations up-front here.
|
||||||
|
docNum1Hit, ok := itr.DocNum1Hit()
|
||||||
|
if ok {
|
||||||
|
if docNum1Hit == zap.DocNum1HitFinished {
|
||||||
|
// An empty docNum here means the entire AND is empty.
|
||||||
|
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||||
|
continue OUTER
|
||||||
|
}
|
||||||
|
|
||||||
|
if docNum1HitLastOk && docNum1HitLast != docNum1Hit {
|
||||||
|
// The docNum1Hit doesn't match the previous
|
||||||
|
// docNum1HitLast, so the entire AND is empty.
|
||||||
|
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||||
|
continue OUTER
|
||||||
|
}
|
||||||
|
|
||||||
|
docNum1HitLast = docNum1Hit
|
||||||
|
docNum1HitLastOk = true
|
||||||
|
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if itr.ActualBM == nil {
|
||||||
|
// An empty actual bitmap means the entire AND is empty.
|
||||||
|
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||||
|
continue OUTER
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collect the actual bitmap for more processing later.
|
||||||
|
actualBMs = append(actualBMs, itr.ActualBM)
|
||||||
|
}
|
||||||
|
|
||||||
|
if docNum1HitLastOk {
|
||||||
|
// We reach here if all the 1-hit optimized posting
|
||||||
|
// iterators had the same 1-hit docNum, so we can check if
|
||||||
|
// our collected actual bitmaps also have that docNum.
|
||||||
|
for _, bm := range actualBMs {
|
||||||
|
if !bm.Contains(uint32(docNum1HitLast)) {
|
||||||
|
// The docNum1Hit isn't in one of our actual
|
||||||
|
// bitmaps, so the entire AND is empty.
|
||||||
|
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||||
|
continue OUTER
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// The actual bitmaps and docNum1Hits all contain or have
|
||||||
|
// the same 1-hit docNum, so that's our AND'ed result.
|
||||||
|
oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit(
|
||||||
|
docNum1HitLast, zap.NormBits1Hit, false, false)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
continue OUTER
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(actualBMs) == 0 {
|
||||||
|
// If we've collected no actual bitmaps at this point,
|
||||||
|
// then the entire AND is empty.
|
||||||
|
oTFR.iterators[i] = segment.AnEmptyPostingsIterator
|
||||||
|
continue OUTER
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(actualBMs) == 1 {
|
||||||
|
// If we've only 1 actual bitmap, then that's our result.
|
||||||
|
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
|
||||||
|
actualBMs[0], false, false)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
continue OUTER
|
||||||
|
}
|
||||||
|
|
||||||
|
// Else, AND together our collected bitmaps as our result.
|
||||||
|
bm := roaring.And(actualBMs[0], actualBMs[1])
|
||||||
|
|
||||||
|
for _, actualBM := range actualBMs[2:] {
|
||||||
|
bm.And(actualBM)
|
||||||
|
}
|
||||||
|
|
||||||
|
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(
|
||||||
|
bm, false, false)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return oTFR, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
// An "unadorned" disjunction optimization is appropriate when
|
||||||
|
// additional or subsidiary information like freq-norm's and
|
||||||
|
// term-vectors are not required, and instead only the internal-id's
|
||||||
|
// are needed.
|
||||||
|
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned(
|
||||||
|
octx index.OptimizableContext) (index.OptimizableContext, error) {
|
||||||
|
if octx == nil {
|
||||||
|
octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot}
|
||||||
|
}
|
||||||
|
|
||||||
|
o, ok := octx.(*OptimizeTFRDisjunctionUnadorned)
|
||||||
|
if !ok {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if o.snapshot != s.snapshot {
|
||||||
|
return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots")
|
||||||
|
}
|
||||||
|
|
||||||
|
o.tfrs = append(o.tfrs, s)
|
||||||
|
|
||||||
|
return o, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type OptimizeTFRDisjunctionUnadorned struct {
|
||||||
|
snapshot *IndexSnapshot
|
||||||
|
|
||||||
|
tfrs []*IndexSnapshotTermFieldReader
|
||||||
|
}
|
||||||
|
|
||||||
|
// Artificial term and field assigned to the reader produced by the
// unadorned disjunction optimization, which may stand for several real
// terms and fields at once.
var (
	OptimizeTFRDisjunctionUnadornedTerm  = []byte("<disjunction:unadorned>")
	OptimizeTFRDisjunctionUnadornedField = "*"
)
|
||||||
|
|
||||||
|
// Finish of an unadorned disjunction optimization will compute a
|
||||||
|
// termFieldReader with an "actual" bitmap that represents the
|
||||||
|
// constituent bitmaps OR'ed together. This termFieldReader cannot
|
||||||
|
// provide any freq-norm or termVector associated information.
|
||||||
|
func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) {
|
||||||
|
if len(o.tfrs) <= 1 {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range o.snapshot.segment {
|
||||||
|
var cMax uint64
|
||||||
|
|
||||||
|
for _, tfr := range o.tfrs {
|
||||||
|
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||||
|
if !ok {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if itr.ActualBM != nil {
|
||||||
|
c := itr.ActualBM.GetCardinality()
|
||||||
|
if cMax < c {
|
||||||
|
cMax = c
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Heuristic to skip the optimization if all the constituent
|
||||||
|
// bitmaps are too small, where the processing & resource
|
||||||
|
// overhead to create the OR'ed bitmap outweighs the benefit.
|
||||||
|
if cMax < OptimizeDisjunctionUnadornedMinChildCardinality {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We use an artificial term and field because the optimized
|
||||||
|
// termFieldReader can represent multiple terms and fields.
|
||||||
|
oTFR := &IndexSnapshotTermFieldReader{
|
||||||
|
term: OptimizeTFRDisjunctionUnadornedTerm,
|
||||||
|
field: OptimizeTFRDisjunctionUnadornedField,
|
||||||
|
snapshot: o.snapshot,
|
||||||
|
iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)),
|
||||||
|
segmentOffset: 0,
|
||||||
|
includeFreq: false,
|
||||||
|
includeNorm: false,
|
||||||
|
includeTermVectors: false,
|
||||||
|
}
|
||||||
|
|
||||||
|
var docNums []uint32 // Collected docNum's from 1-hit posting lists.
|
||||||
|
var actualBMs []*roaring.Bitmap // Collected from regular posting lists.
|
||||||
|
|
||||||
|
for i := range o.snapshot.segment {
|
||||||
|
docNums = docNums[:0]
|
||||||
|
actualBMs = actualBMs[:0]
|
||||||
|
|
||||||
|
for _, tfr := range o.tfrs {
|
||||||
|
itr, ok := tfr.iterators[i].(*zap.PostingsIterator)
|
||||||
|
if !ok {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
docNum, ok := itr.DocNum1Hit()
|
||||||
|
if ok {
|
||||||
|
docNums = append(docNums, uint32(docNum))
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if itr.ActualBM != nil {
|
||||||
|
actualBMs = append(actualBMs, itr.ActualBM)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var bm *roaring.Bitmap
|
||||||
|
if len(actualBMs) > 2 {
|
||||||
|
bm = roaring.HeapOr(actualBMs...)
|
||||||
|
} else if len(actualBMs) == 2 {
|
||||||
|
bm = roaring.Or(actualBMs[0], actualBMs[1])
|
||||||
|
} else if len(actualBMs) == 1 {
|
||||||
|
bm = actualBMs[0].Clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
if bm == nil {
|
||||||
|
bm = roaring.New()
|
||||||
|
}
|
||||||
|
|
||||||
|
bm.AddMany(docNums)
|
||||||
|
|
||||||
|
oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return oTFR, nil
|
||||||
|
}
|
|
@ -16,9 +16,12 @@ package scorch
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"log"
|
"log"
|
||||||
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strconv"
|
"strconv"
|
||||||
|
@ -27,23 +30,57 @@ import (
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||||
"github.com/boltdb/bolt"
|
bolt "github.com/etcd-io/bbolt"
|
||||||
)
|
)
|
||||||
|
|
||||||
var DefaultChunkFactor uint32 = 1024
|
var DefaultChunkFactor uint32 = 1024
|
||||||
|
|
||||||
// Arbitrary number, need to make it configurable.
|
// DefaultPersisterNapTimeMSec is kept to zero as this helps in direct
|
||||||
// Lower values like 10/making persister really slow
|
// persistence of segments with the default safe batch option.
|
||||||
// doesn't work well as it is creating more files to
|
// If the default safe batch option results in high number of
|
||||||
// persist for in next persist iteration and spikes the # FDs.
|
// files on disk, then users may initialise this configuration parameter
|
||||||
// Ideal value should let persister also proceed at
|
// with higher values so that the persister will nap a bit within it's
|
||||||
// an optimum pace so that the merger can skip
|
// work loop to favour better in-memory merging of segments to result
|
||||||
// many intermediate snapshots.
|
// in fewer segment files on disk. But that may come with an indexing
|
||||||
// This needs to be based on empirical data.
|
// performance overhead.
|
||||||
// TODO - may need to revisit this approach/value.
|
// Unsafe batch users are advised to override this to higher value
|
||||||
var epochDistance = uint64(5)
|
// for better performance especially with high data density.
|
||||||
|
var DefaultPersisterNapTimeMSec int = 0 // ms
|
||||||
|
|
||||||
|
var (
	// DefaultPersisterNapUnderNumFiles helps in controlling the pace of
	// the persister. At times of slow merger progress with heavy file
	// merging operations, it's better to pace down the persister to let
	// the merger catch up within a range defined by this parameter.
	// Fewer files on disk (as per the merge plan) would result in keeping
	// the file handle usage under limit, a faster disk merger and a
	// healthier index. It's been observed that such a loosely sync'ed
	// introducer-persister-merger trio results in better overall
	// performance.
	DefaultPersisterNapUnderNumFiles = 1000

	// DefaultMemoryPressurePauseThreshold is the default value of
	// persisterOptions.MemoryPressurePauseThreshold.
	DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64
)
|
||||||
|
|
||||||
|
// persisterOptions bundles the tunables of the persister. It is
// populated from defaults and then overridden from JSON configuration.
type persisterOptions struct {
	// PersisterNapTimeMSec controls the wait/delay injected into the
	// persistence work loop to improve the chances of a healthier and
	// heavier in-memory merging.
	PersisterNapTimeMSec int

	// If PersisterNapTimeMSec > 0 and the number of files is less than
	// PersisterNapUnderNumFiles, then the persister will sleep for
	// PersisterNapTimeMSec to improve the chances of a healthier and
	// heavier in-memory merging.
	PersisterNapUnderNumFiles int

	// MemoryPressurePauseThreshold gives the persister better leeway
	// for prudently performing the memory merge of segments in a memory
	// pressure situation. The config value is an upper threshold for
	// the number of paused application threads. The default value is a
	// very high number, to always favour the merging of memory segments.
	MemoryPressurePauseThreshold uint64
}
|
||||||
|
|
||||||
type notificationChan chan struct{}
|
type notificationChan chan struct{}
|
||||||
|
|
||||||
|
@ -53,8 +90,17 @@ func (s *Scorch) persisterLoop() {
|
||||||
var persistWatchers []*epochWatcher
|
var persistWatchers []*epochWatcher
|
||||||
var lastPersistedEpoch, lastMergedEpoch uint64
|
var lastPersistedEpoch, lastMergedEpoch uint64
|
||||||
var ew *epochWatcher
|
var ew *epochWatcher
|
||||||
|
po, err := s.parsePersisterOptions()
|
||||||
|
if err != nil {
|
||||||
|
s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err))
|
||||||
|
s.asyncTasks.Done()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
OUTER:
|
OUTER:
|
||||||
for {
|
for {
|
||||||
|
atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1)
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-s.closeCh:
|
case <-s.closeCh:
|
||||||
break OUTER
|
break OUTER
|
||||||
|
@ -65,11 +111,13 @@ OUTER:
|
||||||
if ew != nil && ew.epoch > lastMergedEpoch {
|
if ew != nil && ew.epoch > lastMergedEpoch {
|
||||||
lastMergedEpoch = ew.epoch
|
lastMergedEpoch = ew.epoch
|
||||||
}
|
}
|
||||||
persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
|
|
||||||
&lastMergedEpoch, persistWatchers)
|
lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch,
|
||||||
|
lastMergedEpoch, persistWatchers, po)
|
||||||
|
|
||||||
var ourSnapshot *IndexSnapshot
|
var ourSnapshot *IndexSnapshot
|
||||||
var ourPersisted []chan error
|
var ourPersisted []chan error
|
||||||
|
var ourPersistedCallbacks []index.BatchCallback
|
||||||
|
|
||||||
// check to see if there is a new snapshot to persist
|
// check to see if there is a new snapshot to persist
|
||||||
s.rootLock.Lock()
|
s.rootLock.Lock()
|
||||||
|
@ -78,13 +126,17 @@ OUTER:
|
||||||
ourSnapshot.AddRef()
|
ourSnapshot.AddRef()
|
||||||
ourPersisted = s.rootPersisted
|
ourPersisted = s.rootPersisted
|
||||||
s.rootPersisted = nil
|
s.rootPersisted = nil
|
||||||
|
ourPersistedCallbacks = s.persistedCallbacks
|
||||||
|
s.persistedCallbacks = nil
|
||||||
|
atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size()))
|
||||||
|
atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch)
|
||||||
}
|
}
|
||||||
s.rootLock.Unlock()
|
s.rootLock.Unlock()
|
||||||
|
|
||||||
if ourSnapshot != nil {
|
if ourSnapshot != nil {
|
||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
|
|
||||||
err := s.persistSnapshot(ourSnapshot)
|
err := s.persistSnapshot(ourSnapshot, po)
|
||||||
for _, ch := range ourPersisted {
|
for _, ch := range ourPersisted {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
ch <- err
|
ch <- err
|
||||||
|
@ -92,10 +144,22 @@ OUTER:
|
||||||
close(ch)
|
close(ch)
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
atomic.StoreUint64(&s.iStats.persistEpoch, 0)
|
||||||
|
if err == segment.ErrClosed {
|
||||||
|
// index has been closed
|
||||||
|
_ = ourSnapshot.DecRef()
|
||||||
|
break OUTER
|
||||||
|
}
|
||||||
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
|
s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err))
|
||||||
_ = ourSnapshot.DecRef()
|
_ = ourSnapshot.DecRef()
|
||||||
|
atomic.AddUint64(&s.stats.TotPersistLoopErr, 1)
|
||||||
continue OUTER
|
continue OUTER
|
||||||
}
|
}
|
||||||
|
for i := range ourPersistedCallbacks {
|
||||||
|
ourPersistedCallbacks[i](err)
|
||||||
|
}
|
||||||
|
|
||||||
|
atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch)
|
||||||
|
|
||||||
lastPersistedEpoch = ourSnapshot.epoch
|
lastPersistedEpoch = ourSnapshot.epoch
|
||||||
for _, ew := range persistWatchers {
|
for _, ew := range persistWatchers {
|
||||||
|
@ -115,6 +179,8 @@ OUTER:
|
||||||
s.fireEvent(EventKindPersisterProgress, time.Since(startTime))
|
s.fireEvent(EventKindPersisterProgress, time.Since(startTime))
|
||||||
|
|
||||||
if changed {
|
if changed {
|
||||||
|
s.removeOldData()
|
||||||
|
atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1)
|
||||||
continue OUTER
|
continue OUTER
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -133,17 +199,21 @@ OUTER:
|
||||||
|
|
||||||
s.removeOldData() // might as well cleanup while waiting
|
s.removeOldData() // might as well cleanup while waiting
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotPersistLoopWait, 1)
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-s.closeCh:
|
case <-s.closeCh:
|
||||||
break OUTER
|
break OUTER
|
||||||
case <-w.notifyCh:
|
case <-w.notifyCh:
|
||||||
// woken up, next loop should pick up work
|
// woken up, next loop should pick up work
|
||||||
continue OUTER
|
atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1)
|
||||||
case ew = <-s.persisterNotifier:
|
case ew = <-s.persisterNotifier:
|
||||||
// if the watchers are already caught up then let them wait,
|
// if the watchers are already caught up then let them wait,
|
||||||
// else let them continue to do the catch up
|
// else let them continue to do the catch up
|
||||||
persistWatchers = append(persistWatchers, ew)
|
persistWatchers = append(persistWatchers, ew)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -160,32 +230,88 @@ func notifyMergeWatchers(lastPersistedEpoch uint64,
|
||||||
return watchersNext
|
return watchersNext
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64,
|
func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64,
|
||||||
persistWatchers []*epochWatcher) []*epochWatcher {
|
persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) {
|
||||||
|
|
||||||
// first, let the watchers proceed if they lag behind
|
// first, let the watchers proceed if they lag behind
|
||||||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
||||||
|
|
||||||
|
// check the merger lag by counting the segment files on disk,
|
||||||
|
// On finding fewer files on disk, persister takes a short pause
|
||||||
|
// for sufficient in-memory segments to pile up for the next
|
||||||
|
// memory merge cum persist loop.
|
||||||
|
// On finding too many files on disk, persister pause until the merger
|
||||||
|
// catches up to reduce the segment file count under the threshold.
|
||||||
|
// But if there is memory pressure, then skip this sleep maneuvers.
|
||||||
|
numFilesOnDisk, _ := s.diskFileStats()
|
||||||
|
if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) &&
|
||||||
|
po.PersisterNapTimeMSec > 0 && s.paused() == 0 {
|
||||||
|
select {
|
||||||
|
case <-s.closeCh:
|
||||||
|
case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)):
|
||||||
|
atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1)
|
||||||
|
|
||||||
|
case ew := <-s.persisterNotifier:
|
||||||
|
// unblock the merger in meantime
|
||||||
|
persistWatchers = append(persistWatchers, ew)
|
||||||
|
lastMergedEpoch = ew.epoch
|
||||||
|
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
||||||
|
atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1)
|
||||||
|
}
|
||||||
|
return lastMergedEpoch, persistWatchers
|
||||||
|
}
|
||||||
|
|
||||||
OUTER:
|
OUTER:
|
||||||
// check for slow merger and await until the merger catch up
|
for po.PersisterNapUnderNumFiles > 0 &&
|
||||||
for lastPersistedEpoch > *lastMergedEpoch+epochDistance {
|
numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) &&
|
||||||
|
lastMergedEpoch < lastPersistedEpoch {
|
||||||
|
atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1)
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-s.closeCh:
|
case <-s.closeCh:
|
||||||
break OUTER
|
break OUTER
|
||||||
case ew := <-s.persisterNotifier:
|
case ew := <-s.persisterNotifier:
|
||||||
persistWatchers = append(persistWatchers, ew)
|
persistWatchers = append(persistWatchers, ew)
|
||||||
*lastMergedEpoch = ew.epoch
|
lastMergedEpoch = ew.epoch
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1)
|
||||||
|
|
||||||
// let the watchers proceed if they lag behind
|
// let the watchers proceed if they lag behind
|
||||||
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers)
|
||||||
|
|
||||||
|
numFilesOnDisk, _ = s.diskFileStats()
|
||||||
}
|
}
|
||||||
|
|
||||||
return persistWatchers
|
return lastMergedEpoch, persistWatchers
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error {
|
func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) {
|
||||||
|
po := persisterOptions{
|
||||||
|
PersisterNapTimeMSec: DefaultPersisterNapTimeMSec,
|
||||||
|
PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles,
|
||||||
|
MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold,
|
||||||
|
}
|
||||||
|
if v, ok := s.config["scorchPersisterOptions"]; ok {
|
||||||
|
b, err := json.Marshal(v)
|
||||||
|
if err != nil {
|
||||||
|
return &po, err
|
||||||
|
}
|
||||||
|
|
||||||
|
err = json.Unmarshal(b, &po)
|
||||||
|
if err != nil {
|
||||||
|
return &po, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return &po, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot,
|
||||||
|
po *persisterOptions) error {
|
||||||
|
// Perform in-memory segment merging only when the memory pressure is
|
||||||
|
// below the configured threshold, else the persister performs the
|
||||||
|
// direct persistence of segments.
|
||||||
|
if s.paused() < po.MemoryPressurePauseThreshold {
|
||||||
persisted, err := s.persistSnapshotMaybeMerge(snapshot)
|
persisted, err := s.persistSnapshotMaybeMerge(snapshot)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -193,6 +319,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error {
|
||||||
if persisted {
|
if persisted {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return s.persistSnapshotDirect(snapshot)
|
return s.persistSnapshotDirect(snapshot)
|
||||||
}
|
}
|
||||||
|
@ -224,7 +351,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
|
||||||
return false, nil
|
return false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
_, newSnapshot, newSegmentID, err := s.mergeSegmentBases(
|
newSnapshot, newSegmentID, err := s.mergeSegmentBases(
|
||||||
snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor)
|
snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false, err
|
return false, err
|
||||||
|
@ -249,6 +376,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) (
|
||||||
segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)),
|
segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)),
|
||||||
internal: snapshot.internal,
|
internal: snapshot.internal,
|
||||||
epoch: snapshot.epoch,
|
epoch: snapshot.epoch,
|
||||||
|
creator: "persistSnapshotMaybeMerge",
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy to the equiv the segments that weren't replaced
|
// copy to the equiv the segments that weren't replaced
|
||||||
|
@ -301,6 +429,22 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// persist meta values
|
||||||
|
metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
err = metaBucket.Put([]byte("type"), []byte(zap.Type))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
buf := make([]byte, binary.MaxVarintLen32)
|
||||||
|
binary.BigEndian.PutUint32(buf, zap.Version)
|
||||||
|
err = metaBucket.Put([]byte("version"), buf)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// persist internal values
|
// persist internal values
|
||||||
internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey)
|
internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -390,44 +534,21 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
s.rootLock.Lock()
|
persist := &persistIntroduction{
|
||||||
newIndexSnapshot := &IndexSnapshot{
|
persisted: newSegments,
|
||||||
parent: s,
|
applied: make(notificationChan),
|
||||||
epoch: s.nextSnapshotEpoch,
|
|
||||||
segment: make([]*SegmentSnapshot, len(s.root.segment)),
|
|
||||||
offsets: make([]uint64, len(s.root.offsets)),
|
|
||||||
internal: make(map[string][]byte, len(s.root.internal)),
|
|
||||||
refs: 1,
|
|
||||||
}
|
|
||||||
s.nextSnapshotEpoch++
|
|
||||||
for i, segmentSnapshot := range s.root.segment {
|
|
||||||
// see if this segment has been replaced
|
|
||||||
if replacement, ok := newSegments[segmentSnapshot.id]; ok {
|
|
||||||
newSegmentSnapshot := &SegmentSnapshot{
|
|
||||||
id: segmentSnapshot.id,
|
|
||||||
segment: replacement,
|
|
||||||
deleted: segmentSnapshot.deleted,
|
|
||||||
cachedDocs: segmentSnapshot.cachedDocs,
|
|
||||||
}
|
|
||||||
newIndexSnapshot.segment[i] = newSegmentSnapshot
|
|
||||||
delete(newSegments, segmentSnapshot.id)
|
|
||||||
// update items persisted incase of a new segment snapshot
|
|
||||||
atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count())
|
|
||||||
} else {
|
|
||||||
newIndexSnapshot.segment[i] = s.root.segment[i]
|
|
||||||
newIndexSnapshot.segment[i].segment.AddRef()
|
|
||||||
}
|
|
||||||
newIndexSnapshot.offsets[i] = s.root.offsets[i]
|
|
||||||
}
|
|
||||||
for k, v := range s.root.internal {
|
|
||||||
newIndexSnapshot.internal[k] = v
|
|
||||||
}
|
}
|
||||||
|
|
||||||
rootPrev := s.root
|
select {
|
||||||
s.root = newIndexSnapshot
|
case <-s.closeCh:
|
||||||
s.rootLock.Unlock()
|
return segment.ErrClosed
|
||||||
if rootPrev != nil {
|
case s.persists <- persist:
|
||||||
_ = rootPrev.DecRef()
|
}
|
||||||
|
|
||||||
|
select {
|
||||||
|
case <-s.closeCh:
|
||||||
|
return segment.ErrClosed
|
||||||
|
case <-persist.applied:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -462,6 +583,7 @@ var boltSnapshotsBucket = []byte{'s'}
|
||||||
var boltPathKey = []byte{'p'}
|
var boltPathKey = []byte{'p'}
|
||||||
var boltDeletedKey = []byte{'d'}
|
var boltDeletedKey = []byte{'d'}
|
||||||
var boltInternalKey = []byte{'i'}
|
var boltInternalKey = []byte{'i'}
|
||||||
|
var boltMetaDataKey = []byte{'m'}
|
||||||
|
|
||||||
func (s *Scorch) loadFromBolt() error {
|
func (s *Scorch) loadFromBolt() error {
|
||||||
return s.rootBolt.View(func(tx *bolt.Tx) error {
|
return s.rootBolt.View(func(tx *bolt.Tx) error {
|
||||||
|
@ -478,19 +600,19 @@ func (s *Scorch) loadFromBolt() error {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
if foundRoot {
|
if foundRoot {
|
||||||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch)
|
s.AddEligibleForRemoval(snapshotEpoch)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
snapshot := snapshots.Bucket(k)
|
snapshot := snapshots.Bucket(k)
|
||||||
if snapshot == nil {
|
if snapshot == nil {
|
||||||
log.Printf("snapshot key, but bucket missing %x, continuing", k)
|
log.Printf("snapshot key, but bucket missing %x, continuing", k)
|
||||||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch)
|
s.AddEligibleForRemoval(snapshotEpoch)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
indexSnapshot, err := s.loadSnapshot(snapshot)
|
indexSnapshot, err := s.loadSnapshot(snapshot)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("unable to load snapshot, %v, continuing", err)
|
log.Printf("unable to load snapshot, %v, continuing", err)
|
||||||
s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch)
|
s.AddEligibleForRemoval(snapshotEpoch)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
indexSnapshot.epoch = snapshotEpoch
|
indexSnapshot.epoch = snapshotEpoch
|
||||||
|
@ -500,13 +622,16 @@ func (s *Scorch) loadFromBolt() error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
s.nextSegmentID++
|
s.nextSegmentID++
|
||||||
s.nextSnapshotEpoch = snapshotEpoch + 1
|
|
||||||
s.rootLock.Lock()
|
s.rootLock.Lock()
|
||||||
if s.root != nil {
|
s.nextSnapshotEpoch = snapshotEpoch + 1
|
||||||
_ = s.root.DecRef()
|
rootPrev := s.root
|
||||||
}
|
|
||||||
s.root = indexSnapshot
|
s.root = indexSnapshot
|
||||||
s.rootLock.Unlock()
|
s.rootLock.Unlock()
|
||||||
|
|
||||||
|
if rootPrev != nil {
|
||||||
|
_ = rootPrev.DecRef()
|
||||||
|
}
|
||||||
|
|
||||||
foundRoot = true
|
foundRoot = true
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
@ -524,7 +649,7 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
|
||||||
snapshotKey := segment.EncodeUvarintAscending(nil, epoch)
|
snapshotKey := segment.EncodeUvarintAscending(nil, epoch)
|
||||||
snapshot := snapshots.Bucket(snapshotKey)
|
snapshot := snapshots.Bucket(snapshotKey)
|
||||||
if snapshot == nil {
|
if snapshot == nil {
|
||||||
return nil
|
return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch)
|
||||||
}
|
}
|
||||||
rv, err = s.loadSnapshot(snapshot)
|
rv, err = s.loadSnapshot(snapshot)
|
||||||
return err
|
return err
|
||||||
|
@ -536,12 +661,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
||||||
|
|
||||||
rv := &IndexSnapshot{
|
rv := &IndexSnapshot{
|
||||||
parent: s,
|
parent: s,
|
||||||
internal: make(map[string][]byte),
|
internal: make(map[string][]byte),
|
||||||
refs: 1,
|
refs: 1,
|
||||||
|
creator: "loadSnapshot",
|
||||||
}
|
}
|
||||||
|
|
||||||
var running uint64
|
var running uint64
|
||||||
c := snapshot.Cursor()
|
c := snapshot.Cursor()
|
||||||
for k, _ := c.First(); k != nil; k, _ = c.Next() {
|
for k, _ := c.First(); k != nil; k, _ = c.Next() {
|
||||||
|
@ -556,7 +682,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
||||||
_ = rv.DecRef()
|
_ = rv.DecRef()
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
} else {
|
} else if k[0] != boltMetaDataKey[0] {
|
||||||
segmentBucket := snapshot.Bucket(k)
|
segmentBucket := snapshot.Bucket(k)
|
||||||
if segmentBucket == nil {
|
if segmentBucket == nil {
|
||||||
_ = rv.DecRef()
|
_ = rv.DecRef()
|
||||||
|
@ -577,6 +703,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) {
|
||||||
running += segmentSnapshot.segment.Count()
|
running += segmentSnapshot.segment.Count()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -604,8 +731,10 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro
|
||||||
_ = segment.Close()
|
_ = segment.Close()
|
||||||
return nil, fmt.Errorf("error reading deleted bytes: %v", err)
|
return nil, fmt.Errorf("error reading deleted bytes: %v", err)
|
||||||
}
|
}
|
||||||
|
if !deletedBitmap.IsEmpty() {
|
||||||
rv.deleted = deletedBitmap
|
rv.deleted = deletedBitmap
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
@ -643,14 +772,14 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
|
||||||
return 0, err
|
return 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(persistedEpochs) <= NumSnapshotsToKeep {
|
if len(persistedEpochs) <= s.numSnapshotsToKeep {
|
||||||
// we need to keep everything
|
// we need to keep everything
|
||||||
return 0, nil
|
return 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// make a map of epochs to protect from deletion
|
// make a map of epochs to protect from deletion
|
||||||
protectedEpochs := make(map[uint64]struct{}, NumSnapshotsToKeep)
|
protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep)
|
||||||
for _, epoch := range persistedEpochs[0:NumSnapshotsToKeep] {
|
for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] {
|
||||||
protectedEpochs[epoch] = struct{}{}
|
protectedEpochs[epoch] = struct{}{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -668,7 +797,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) {
|
||||||
s.eligibleForRemoval = newEligible
|
s.eligibleForRemoval = newEligible
|
||||||
s.rootLock.Unlock()
|
s.rootLock.Unlock()
|
||||||
|
|
||||||
if len(epochsToRemove) <= 0 {
|
if len(epochsToRemove) == 0 {
|
||||||
return 0, nil
|
return 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,110 +0,0 @@
|
||||||
// Copyright (c) 2017 Couchbase, Inc.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
package scorch
|
|
||||||
|
|
||||||
import (
|
|
||||||
"github.com/blevesearch/bleve/document"
|
|
||||||
"github.com/blevesearch/bleve/index"
|
|
||||||
)
|
|
||||||
|
|
||||||
type Reader struct {
|
|
||||||
root *IndexSnapshot // Owns 1 ref-count on the index snapshot.
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) TermFieldReader(term []byte, field string, includeFreq,
|
|
||||||
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
|
|
||||||
return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors)
|
|
||||||
}
|
|
||||||
|
|
||||||
// DocIDReader returns an iterator over all doc ids
|
|
||||||
// The caller must close returned instance to release associated resources.
|
|
||||||
func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) {
|
|
||||||
return r.root.DocIDReaderAll()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) {
|
|
||||||
return r.root.DocIDReaderOnly(ids)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) FieldDict(field string) (index.FieldDict, error) {
|
|
||||||
return r.root.FieldDict(field)
|
|
||||||
}
|
|
||||||
|
|
||||||
// FieldDictRange is currently defined to include the start and end terms
|
|
||||||
func (r *Reader) FieldDictRange(field string, startTerm []byte,
|
|
||||||
endTerm []byte) (index.FieldDict, error) {
|
|
||||||
return r.root.FieldDictRange(field, startTerm, endTerm)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) FieldDictPrefix(field string,
|
|
||||||
termPrefix []byte) (index.FieldDict, error) {
|
|
||||||
return r.root.FieldDictPrefix(field, termPrefix)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) Document(id string) (*document.Document, error) {
|
|
||||||
return r.root.Document(id)
|
|
||||||
}
|
|
||||||
func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string,
|
|
||||||
visitor index.DocumentFieldTermVisitor) error {
|
|
||||||
return r.root.DocumentVisitFieldTerms(id, fields, visitor)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) Fields() ([]string, error) {
|
|
||||||
return r.root.Fields()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) GetInternal(key []byte) ([]byte, error) {
|
|
||||||
return r.root.GetInternal(key)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) DocCount() (uint64, error) {
|
|
||||||
return r.root.DocCount()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) {
|
|
||||||
return r.root.ExternalID(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) InternalID(id string) (index.IndexInternalID, error) {
|
|
||||||
return r.root.InternalID(id)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) DumpAll() chan interface{} {
|
|
||||||
rv := make(chan interface{})
|
|
||||||
go func() {
|
|
||||||
close(rv)
|
|
||||||
}()
|
|
||||||
return rv
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) DumpDoc(id string) chan interface{} {
|
|
||||||
rv := make(chan interface{})
|
|
||||||
go func() {
|
|
||||||
close(rv)
|
|
||||||
}()
|
|
||||||
return rv
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) DumpFields() chan interface{} {
|
|
||||||
rv := make(chan interface{})
|
|
||||||
go func() {
|
|
||||||
close(rv)
|
|
||||||
}()
|
|
||||||
return rv
|
|
||||||
}
|
|
||||||
|
|
||||||
func (r *Reader) Close() error {
|
|
||||||
return r.root.DecRef()
|
|
||||||
}
|
|
|
@ -17,6 +17,7 @@ package scorch
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"io/ioutil"
|
||||||
"os"
|
"os"
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
@ -27,23 +28,24 @@ import (
|
||||||
"github.com/blevesearch/bleve/document"
|
"github.com/blevesearch/bleve/document"
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
"github.com/blevesearch/bleve/index/scorch/segment/zap"
|
||||||
"github.com/blevesearch/bleve/index/store"
|
"github.com/blevesearch/bleve/index/store"
|
||||||
"github.com/blevesearch/bleve/registry"
|
"github.com/blevesearch/bleve/registry"
|
||||||
"github.com/boltdb/bolt"
|
bolt "github.com/etcd-io/bbolt"
|
||||||
)
|
)
|
||||||
|
|
||||||
const Name = "scorch"
|
const Name = "scorch"
|
||||||
|
|
||||||
const Version uint8 = 1
|
const Version uint8 = 2
|
||||||
|
|
||||||
|
var ErrClosed = fmt.Errorf("scorch closed")
|
||||||
|
|
||||||
type Scorch struct {
|
type Scorch struct {
|
||||||
readOnly bool
|
readOnly bool
|
||||||
version uint8
|
version uint8
|
||||||
config map[string]interface{}
|
config map[string]interface{}
|
||||||
analysisQueue *index.AnalysisQueue
|
analysisQueue *index.AnalysisQueue
|
||||||
stats *Stats
|
stats Stats
|
||||||
nextSegmentID uint64
|
nextSegmentID uint64
|
||||||
path string
|
path string
|
||||||
|
|
||||||
|
@ -52,12 +54,15 @@ type Scorch struct {
|
||||||
rootLock sync.RWMutex
|
rootLock sync.RWMutex
|
||||||
root *IndexSnapshot // holds 1 ref-count on the root
|
root *IndexSnapshot // holds 1 ref-count on the root
|
||||||
rootPersisted []chan error // closed when root is persisted
|
rootPersisted []chan error // closed when root is persisted
|
||||||
|
persistedCallbacks []index.BatchCallback
|
||||||
nextSnapshotEpoch uint64
|
nextSnapshotEpoch uint64
|
||||||
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC.
|
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC.
|
||||||
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet.
|
ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet.
|
||||||
|
|
||||||
|
numSnapshotsToKeep int
|
||||||
closeCh chan struct{}
|
closeCh chan struct{}
|
||||||
introductions chan *segmentIntroduction
|
introductions chan *segmentIntroduction
|
||||||
|
persists chan *persistIntroduction
|
||||||
merges chan *segmentMerge
|
merges chan *segmentMerge
|
||||||
introducerNotifier chan *epochWatcher
|
introducerNotifier chan *epochWatcher
|
||||||
revertToSnapshots chan *snapshotReversion
|
revertToSnapshots chan *snapshotReversion
|
||||||
|
@ -67,6 +72,23 @@ type Scorch struct {
|
||||||
|
|
||||||
onEvent func(event Event)
|
onEvent func(event Event)
|
||||||
onAsyncError func(err error)
|
onAsyncError func(err error)
|
||||||
|
|
||||||
|
iStats internalStats
|
||||||
|
|
||||||
|
pauseLock sync.RWMutex
|
||||||
|
|
||||||
|
pauseCount uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
type internalStats struct {
|
||||||
|
persistEpoch uint64
|
||||||
|
persistSnapshotSize uint64
|
||||||
|
mergeEpoch uint64
|
||||||
|
mergeSnapshotSize uint64
|
||||||
|
newSegBufBytesAdded uint64
|
||||||
|
newSegBufBytesRemoved uint64
|
||||||
|
analysisBytesAdded uint64
|
||||||
|
analysisBytesRemoved uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewScorch(storeName string,
|
func NewScorch(storeName string,
|
||||||
|
@ -80,8 +102,7 @@ func NewScorch(storeName string,
|
||||||
closeCh: make(chan struct{}),
|
closeCh: make(chan struct{}),
|
||||||
ineligibleForRemoval: map[string]bool{},
|
ineligibleForRemoval: map[string]bool{},
|
||||||
}
|
}
|
||||||
rv.stats = &Stats{i: rv}
|
rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"}
|
||||||
rv.root = &IndexSnapshot{parent: rv, refs: 1}
|
|
||||||
ro, ok := config["read_only"].(bool)
|
ro, ok := config["read_only"].(bool)
|
||||||
if ok {
|
if ok {
|
||||||
rv.readOnly = ro
|
rv.readOnly = ro
|
||||||
|
@ -101,9 +122,30 @@ func NewScorch(storeName string,
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *Scorch) paused() uint64 {
|
||||||
|
s.pauseLock.Lock()
|
||||||
|
pc := s.pauseCount
|
||||||
|
s.pauseLock.Unlock()
|
||||||
|
return pc
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scorch) incrPause() {
|
||||||
|
s.pauseLock.Lock()
|
||||||
|
s.pauseCount++
|
||||||
|
s.pauseLock.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scorch) decrPause() {
|
||||||
|
s.pauseLock.Lock()
|
||||||
|
s.pauseCount--
|
||||||
|
s.pauseLock.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) {
|
func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) {
|
||||||
if s.onEvent != nil {
|
if s.onEvent != nil {
|
||||||
|
s.incrPause()
|
||||||
s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
|
s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur})
|
||||||
|
s.decrPause()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -111,6 +153,7 @@ func (s *Scorch) fireAsyncError(err error) {
|
||||||
if s.onAsyncError != nil {
|
if s.onAsyncError != nil {
|
||||||
s.onAsyncError(err)
|
s.onAsyncError(err)
|
||||||
}
|
}
|
||||||
|
atomic.AddUint64(&s.stats.TotOnErrors, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) Open() error {
|
func (s *Scorch) Open() error {
|
||||||
|
@ -172,7 +215,10 @@ func (s *Scorch) openBolt() error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, uint64(len(s.root.segment)))
|
||||||
|
|
||||||
s.introductions = make(chan *segmentIntroduction)
|
s.introductions = make(chan *segmentIntroduction)
|
||||||
|
s.persists = make(chan *persistIntroduction)
|
||||||
s.merges = make(chan *segmentMerge)
|
s.merges = make(chan *segmentMerge)
|
||||||
s.introducerNotifier = make(chan *epochWatcher, 1)
|
s.introducerNotifier = make(chan *epochWatcher, 1)
|
||||||
s.revertToSnapshots = make(chan *snapshotReversion)
|
s.revertToSnapshots = make(chan *snapshotReversion)
|
||||||
|
@ -186,6 +232,17 @@ func (s *Scorch) openBolt() error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
s.numSnapshotsToKeep = NumSnapshotsToKeep
|
||||||
|
if v, ok := s.config["numSnapshotsToKeep"]; ok {
|
||||||
|
var t int
|
||||||
|
if t, err = parseToInteger(v); err != nil {
|
||||||
|
return fmt.Errorf("numSnapshotsToKeep parse err: %v", err)
|
||||||
|
}
|
||||||
|
if t > 0 {
|
||||||
|
s.numSnapshotsToKeep = t
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -255,6 +312,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
|
||||||
|
|
||||||
// FIXME could sort ids list concurrent with analysis?
|
// FIXME could sort ids list concurrent with analysis?
|
||||||
|
|
||||||
|
if len(batch.IndexOps) > 0 {
|
||||||
go func() {
|
go func() {
|
||||||
for _, doc := range batch.IndexOps {
|
for _, doc := range batch.IndexOps {
|
||||||
if doc != nil {
|
if doc != nil {
|
||||||
|
@ -264,47 +322,63 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
// wait for analysis result
|
// wait for analysis result
|
||||||
analysisResults := make([]*index.AnalysisResult, int(numUpdates))
|
analysisResults := make([]*index.AnalysisResult, int(numUpdates))
|
||||||
var itemsDeQueued uint64
|
var itemsDeQueued uint64
|
||||||
|
var totalAnalysisSize int
|
||||||
for itemsDeQueued < numUpdates {
|
for itemsDeQueued < numUpdates {
|
||||||
result := <-resultChan
|
result := <-resultChan
|
||||||
|
resultSize := result.Size()
|
||||||
|
atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize))
|
||||||
|
totalAnalysisSize += resultSize
|
||||||
analysisResults[itemsDeQueued] = result
|
analysisResults[itemsDeQueued] = result
|
||||||
itemsDeQueued++
|
itemsDeQueued++
|
||||||
}
|
}
|
||||||
close(resultChan)
|
close(resultChan)
|
||||||
|
defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize))
|
||||||
|
|
||||||
atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(start)))
|
atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start)))
|
||||||
|
|
||||||
|
indexStart := time.Now()
|
||||||
|
|
||||||
// notify handlers that we're about to introduce a segment
|
// notify handlers that we're about to introduce a segment
|
||||||
s.fireEvent(EventKindBatchIntroductionStart, 0)
|
s.fireEvent(EventKindBatchIntroductionStart, 0)
|
||||||
|
|
||||||
var newSegment segment.Segment
|
var newSegment segment.Segment
|
||||||
|
var bufBytes uint64
|
||||||
if len(analysisResults) > 0 {
|
if len(analysisResults) > 0 {
|
||||||
newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), DefaultChunkFactor)
|
newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes)
|
||||||
|
} else {
|
||||||
|
atomic.AddUint64(&s.stats.TotBatchesEmpty, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
err = s.prepareSegment(newSegment, ids, batch.InternalOps)
|
err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if newSegment != nil {
|
if newSegment != nil {
|
||||||
_ = newSegment.Close()
|
_ = newSegment.Close()
|
||||||
}
|
}
|
||||||
atomic.AddUint64(&s.stats.errors, 1)
|
atomic.AddUint64(&s.stats.TotOnErrors, 1)
|
||||||
} else {
|
} else {
|
||||||
atomic.AddUint64(&s.stats.updates, numUpdates)
|
atomic.AddUint64(&s.stats.TotUpdates, numUpdates)
|
||||||
atomic.AddUint64(&s.stats.deletes, numDeletes)
|
atomic.AddUint64(&s.stats.TotDeletes, numDeletes)
|
||||||
atomic.AddUint64(&s.stats.batches, 1)
|
atomic.AddUint64(&s.stats.TotBatches, 1)
|
||||||
atomic.AddUint64(&s.stats.numPlainTextBytesIndexed, numPlainTextBytes)
|
atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
atomic.AddUint64(&s.iStats.newSegBufBytesRemoved, bufBytes)
|
||||||
|
atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart)))
|
||||||
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
||||||
internalOps map[string][]byte) error {
|
internalOps map[string][]byte, persistedCallback index.BatchCallback) error {
|
||||||
|
|
||||||
// new introduction
|
// new introduction
|
||||||
introduction := &segmentIntroduction{
|
introduction := &segmentIntroduction{
|
||||||
|
@ -314,6 +388,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
||||||
obsoletes: make(map[uint64]*roaring.Bitmap),
|
obsoletes: make(map[uint64]*roaring.Bitmap),
|
||||||
internal: internalOps,
|
internal: internalOps,
|
||||||
applied: make(chan error),
|
applied: make(chan error),
|
||||||
|
persistedCallback: persistedCallback,
|
||||||
}
|
}
|
||||||
|
|
||||||
if !s.unsafeBatch {
|
if !s.unsafeBatch {
|
||||||
|
@ -326,6 +401,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
||||||
root.AddRef()
|
root.AddRef()
|
||||||
s.rootLock.RUnlock()
|
s.rootLock.RUnlock()
|
||||||
|
|
||||||
|
defer func() { _ = root.DecRef() }()
|
||||||
|
|
||||||
for _, seg := range root.segment {
|
for _, seg := range root.segment {
|
||||||
delta, err := seg.segment.DocNumbers(ids)
|
delta, err := seg.segment.DocNumbers(ids)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -334,7 +411,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
||||||
introduction.obsoletes[seg.id] = delta
|
introduction.obsoletes[seg.id] = delta
|
||||||
}
|
}
|
||||||
|
|
||||||
_ = root.DecRef()
|
introStartTime := time.Now()
|
||||||
|
|
||||||
s.introductions <- introduction
|
s.introductions <- introduction
|
||||||
|
|
||||||
|
@ -348,6 +425,12 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string,
|
||||||
err = <-introduction.persisted
|
err = <-introduction.persisted
|
||||||
}
|
}
|
||||||
|
|
||||||
|
introTime := uint64(time.Since(introStartTime))
|
||||||
|
atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime)
|
||||||
|
if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime {
|
||||||
|
atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime)
|
||||||
|
}
|
||||||
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -366,18 +449,69 @@ func (s *Scorch) DeleteInternal(key []byte) error {
|
||||||
// Reader returns a low-level accessor on the index data. Close it to
|
// Reader returns a low-level accessor on the index data. Close it to
|
||||||
// release associated resources.
|
// release associated resources.
|
||||||
func (s *Scorch) Reader() (index.IndexReader, error) {
|
func (s *Scorch) Reader() (index.IndexReader, error) {
|
||||||
|
return s.currentSnapshot(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *Scorch) currentSnapshot() *IndexSnapshot {
|
||||||
s.rootLock.RLock()
|
s.rootLock.RLock()
|
||||||
rv := &Reader{root: s.root}
|
rv := s.root
|
||||||
rv.root.AddRef()
|
if rv != nil {
|
||||||
|
rv.AddRef()
|
||||||
|
}
|
||||||
s.rootLock.RUnlock()
|
s.rootLock.RUnlock()
|
||||||
return rv, nil
|
return rv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) Stats() json.Marshaler {
|
func (s *Scorch) Stats() json.Marshaler {
|
||||||
return s.stats
|
return &s.stats
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *Scorch) diskFileStats() (uint64, uint64) {
|
||||||
|
var numFilesOnDisk, numBytesUsedDisk uint64
|
||||||
|
if s.path != "" {
|
||||||
|
finfos, err := ioutil.ReadDir(s.path)
|
||||||
|
if err == nil {
|
||||||
|
for _, finfo := range finfos {
|
||||||
|
if !finfo.IsDir() {
|
||||||
|
numBytesUsedDisk += uint64(finfo.Size())
|
||||||
|
numFilesOnDisk++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return numFilesOnDisk, numBytesUsedDisk
|
||||||
|
}
|
||||||
|
|
||||||
func (s *Scorch) StatsMap() map[string]interface{} {
|
func (s *Scorch) StatsMap() map[string]interface{} {
|
||||||
m, _ := s.stats.statsMap()
|
m := s.stats.ToMap()
|
||||||
|
|
||||||
|
numFilesOnDisk, numBytesUsedDisk := s.diskFileStats()
|
||||||
|
|
||||||
|
m["CurOnDiskBytes"] = numBytesUsedDisk
|
||||||
|
m["CurOnDiskFiles"] = numFilesOnDisk
|
||||||
|
|
||||||
|
// TODO: consider one day removing these backwards compatible
|
||||||
|
// names for apps using the old names
|
||||||
|
m["updates"] = m["TotUpdates"]
|
||||||
|
m["deletes"] = m["TotDeletes"]
|
||||||
|
m["batches"] = m["TotBatches"]
|
||||||
|
m["errors"] = m["TotOnErrors"]
|
||||||
|
m["analysis_time"] = m["TotAnalysisTime"]
|
||||||
|
m["index_time"] = m["TotIndexTime"]
|
||||||
|
m["term_searchers_started"] = m["TotTermSearchersStarted"]
|
||||||
|
m["term_searchers_finished"] = m["TotTermSearchersFinished"]
|
||||||
|
m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"]
|
||||||
|
m["num_items_introduced"] = m["TotIntroducedItems"]
|
||||||
|
m["num_items_persisted"] = m["TotPersistedItems"]
|
||||||
|
m["num_recs_to_persist"] = m["TotItemsToPersist"]
|
||||||
|
m["num_bytes_used_disk"] = m["CurOnDiskBytes"]
|
||||||
|
m["num_files_on_disk"] = m["CurOnDiskFiles"]
|
||||||
|
m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"]
|
||||||
|
m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"]
|
||||||
|
m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"]
|
||||||
|
m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"]
|
||||||
|
m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"]
|
||||||
|
|
||||||
return m
|
return m
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -394,7 +528,7 @@ func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult {
|
||||||
rv.Analyzed[i] = tokenFreqs
|
rv.Analyzed[i] = tokenFreqs
|
||||||
rv.Length[i] = fieldLength
|
rv.Length[i] = fieldLength
|
||||||
|
|
||||||
if len(d.CompositeFields) > 0 {
|
if len(d.CompositeFields) > 0 && field.Name() != "_id" {
|
||||||
// see if any of the composite fields need this
|
// see if any of the composite fields need this
|
||||||
for _, compositeField := range d.CompositeFields {
|
for _, compositeField := range d.CompositeFields {
|
||||||
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
|
compositeField.Compose(field.Name(), fieldLength, tokenFreqs)
|
||||||
|
@ -418,20 +552,43 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) {
|
||||||
s.rootLock.Unlock()
|
s.rootLock.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Scorch) MemoryUsed() uint64 {
|
func (s *Scorch) MemoryUsed() (memUsed uint64) {
|
||||||
var memUsed uint64
|
indexSnapshot := s.currentSnapshot()
|
||||||
s.rootLock.RLock()
|
if indexSnapshot == nil {
|
||||||
if s.root != nil {
|
return
|
||||||
for _, segmentSnapshot := range s.root.segment {
|
|
||||||
memUsed += 8 /* size of id -> uint64 */ +
|
|
||||||
segmentSnapshot.segment.SizeInBytes()
|
|
||||||
if segmentSnapshot.deleted != nil {
|
|
||||||
memUsed += segmentSnapshot.deleted.GetSizeInBytes()
|
|
||||||
}
|
}
|
||||||
memUsed += segmentSnapshot.cachedDocs.sizeInBytes()
|
|
||||||
|
defer func() {
|
||||||
|
_ = indexSnapshot.Close()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Account for current root snapshot overhead
|
||||||
|
memUsed += uint64(indexSnapshot.Size())
|
||||||
|
|
||||||
|
// Account for snapshot that the persister may be working on
|
||||||
|
persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch)
|
||||||
|
persistSnapshotSize := atomic.LoadUint64(&s.iStats.persistSnapshotSize)
|
||||||
|
if persistEpoch != 0 && indexSnapshot.epoch > persistEpoch {
|
||||||
|
// the snapshot that the persister is working on isn't the same as
|
||||||
|
// the current snapshot
|
||||||
|
memUsed += persistSnapshotSize
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Account for snapshot that the merger may be working on
|
||||||
|
mergeEpoch := atomic.LoadUint64(&s.iStats.mergeEpoch)
|
||||||
|
mergeSnapshotSize := atomic.LoadUint64(&s.iStats.mergeSnapshotSize)
|
||||||
|
if mergeEpoch != 0 && indexSnapshot.epoch > mergeEpoch {
|
||||||
|
// the snapshot that the merger is working on isn't the same as
|
||||||
|
// the current snapshot
|
||||||
|
memUsed += mergeSnapshotSize
|
||||||
}
|
}
|
||||||
s.rootLock.RUnlock()
|
|
||||||
|
memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) -
|
||||||
|
atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved))
|
||||||
|
|
||||||
|
memUsed += (atomic.LoadUint64(&s.iStats.analysisBytesAdded) -
|
||||||
|
atomic.LoadUint64(&s.iStats.analysisBytesRemoved))
|
||||||
|
|
||||||
return memUsed
|
return memUsed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -450,3 +607,15 @@ func (s *Scorch) unmarkIneligibleForRemoval(filename string) {
|
||||||
func init() {
|
func init() {
|
||||||
registry.RegisterIndexType(Name, NewScorch)
|
registry.RegisterIndexType(Name, NewScorch)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func parseToInteger(i interface{}) (int, error) {
|
||||||
|
switch v := i.(type) {
|
||||||
|
case float64:
|
||||||
|
return int(v), nil
|
||||||
|
case int:
|
||||||
|
return v, nil
|
||||||
|
|
||||||
|
default:
|
||||||
|
return 0, fmt.Errorf("expects int or float64 value")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@ package segment
|
||||||
import (
|
import (
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
|
"github.com/couchbase/vellum"
|
||||||
)
|
)
|
||||||
|
|
||||||
type EmptySegment struct{}
|
type EmptySegment struct{}
|
||||||
|
@ -29,6 +30,10 @@ func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisit
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *EmptySegment) DocID(num uint64) ([]byte, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (e *EmptySegment) Count() uint64 {
|
func (e *EmptySegment) Count() uint64 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
@ -46,6 +51,10 @@ func (e *EmptySegment) Close() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *EmptySegment) Size() uint64 {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func (e *EmptySegment) AddRef() {
|
func (e *EmptySegment) AddRef() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -55,8 +64,8 @@ func (e *EmptySegment) DecRef() error {
|
||||||
|
|
||||||
type EmptyDictionary struct{}
|
type EmptyDictionary struct{}
|
||||||
|
|
||||||
func (e *EmptyDictionary) PostingsList(term string,
|
func (e *EmptyDictionary) PostingsList(term []byte,
|
||||||
except *roaring.Bitmap) (PostingsList, error) {
|
except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) {
|
||||||
return &EmptyPostingsList{}, nil
|
return &EmptyPostingsList{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -72,18 +81,37 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator {
|
||||||
return &EmptyDictionaryIterator{}
|
return &EmptyDictionaryIterator{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *EmptyDictionary) AutomatonIterator(a vellum.Automaton,
|
||||||
|
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator {
|
||||||
|
return &EmptyDictionaryIterator{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte,
|
||||||
|
includeCount bool) DictionaryIterator {
|
||||||
|
return &EmptyDictionaryIterator{}
|
||||||
|
}
|
||||||
|
|
||||||
type EmptyDictionaryIterator struct{}
|
type EmptyDictionaryIterator struct{}
|
||||||
|
|
||||||
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
|
func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
type EmptyPostingsList struct{}
|
type EmptyPostingsList struct{}
|
||||||
|
|
||||||
func (e *EmptyPostingsList) Iterator() PostingsIterator {
|
func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool,
|
||||||
|
prealloc PostingsIterator) PostingsIterator {
|
||||||
return &EmptyPostingsIterator{}
|
return &EmptyPostingsIterator{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *EmptyPostingsList) Size() int {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
func (e *EmptyPostingsList) Count() uint64 {
|
func (e *EmptyPostingsList) Count() uint64 {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
@ -93,3 +121,9 @@ type EmptyPostingsIterator struct{}
|
||||||
func (e *EmptyPostingsIterator) Next() (Posting, error) {
|
func (e *EmptyPostingsIterator) Next() (Posting, error) {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (e *EmptyPostingsIterator) Size() int {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
var AnEmptyPostingsIterator = &EmptyPostingsIterator{}
|
||||||
|
|
|
@ -1,321 +0,0 @@
|
||||||
// Copyright (c) 2017 Couchbase, Inc.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
package mem
|
|
||||||
|
|
||||||
import (
|
|
||||||
"math"
|
|
||||||
"sort"
|
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
|
||||||
"github.com/blevesearch/bleve/document"
|
|
||||||
"github.com/blevesearch/bleve/index"
|
|
||||||
)
|
|
||||||
|
|
||||||
// NewFromAnalyzedDocs places the analyzed document mutations into a new segment
|
|
||||||
func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment {
|
|
||||||
s := New()
|
|
||||||
|
|
||||||
// ensure that _id field get fieldID 0
|
|
||||||
s.getOrDefineField("_id")
|
|
||||||
|
|
||||||
// fill Dicts/DictKeys and preallocate memory
|
|
||||||
s.initializeDict(results)
|
|
||||||
|
|
||||||
// walk each doc
|
|
||||||
for _, result := range results {
|
|
||||||
s.processDocument(result)
|
|
||||||
}
|
|
||||||
|
|
||||||
// go back and sort the dictKeys
|
|
||||||
for _, dict := range s.DictKeys {
|
|
||||||
sort.Strings(dict)
|
|
||||||
}
|
|
||||||
|
|
||||||
// compute memory usage of segment
|
|
||||||
s.updateSizeInBytes()
|
|
||||||
|
|
||||||
// professional debugging
|
|
||||||
//
|
|
||||||
// log.Printf("fields: %v\n", s.FieldsMap)
|
|
||||||
// log.Printf("fieldsInv: %v\n", s.FieldsInv)
|
|
||||||
// log.Printf("fieldsLoc: %v\n", s.FieldsLoc)
|
|
||||||
// log.Printf("dicts: %v\n", s.Dicts)
|
|
||||||
// log.Printf("dict keys: %v\n", s.DictKeys)
|
|
||||||
// for i, posting := range s.Postings {
|
|
||||||
// log.Printf("posting %d: %v\n", i, posting)
|
|
||||||
// }
|
|
||||||
// for i, freq := range s.Freqs {
|
|
||||||
// log.Printf("freq %d: %v\n", i, freq)
|
|
||||||
// }
|
|
||||||
// for i, norm := range s.Norms {
|
|
||||||
// log.Printf("norm %d: %v\n", i, norm)
|
|
||||||
// }
|
|
||||||
// for i, field := range s.Locfields {
|
|
||||||
// log.Printf("field %d: %v\n", i, field)
|
|
||||||
// }
|
|
||||||
// for i, start := range s.Locstarts {
|
|
||||||
// log.Printf("start %d: %v\n", i, start)
|
|
||||||
// }
|
|
||||||
// for i, end := range s.Locends {
|
|
||||||
// log.Printf("end %d: %v\n", i, end)
|
|
||||||
// }
|
|
||||||
// for i, pos := range s.Locpos {
|
|
||||||
// log.Printf("pos %d: %v\n", i, pos)
|
|
||||||
// }
|
|
||||||
// for i, apos := range s.Locarraypos {
|
|
||||||
// log.Printf("apos %d: %v\n", i, apos)
|
|
||||||
// }
|
|
||||||
// log.Printf("stored: %v\n", s.Stored)
|
|
||||||
// log.Printf("stored types: %v\n", s.StoredTypes)
|
|
||||||
// log.Printf("stored pos: %v\n", s.StoredPos)
|
|
||||||
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
// fill Dicts/DictKeys and preallocate memory for postings
|
|
||||||
func (s *Segment) initializeDict(results []*index.AnalysisResult) {
|
|
||||||
var numPostingsLists int
|
|
||||||
|
|
||||||
numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
|
|
||||||
numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id.
|
|
||||||
|
|
||||||
var numTokenFrequencies int
|
|
||||||
var totLocs int
|
|
||||||
|
|
||||||
// initial scan for all fieldID's to sort them
|
|
||||||
for _, result := range results {
|
|
||||||
for _, field := range result.Document.CompositeFields {
|
|
||||||
s.getOrDefineField(field.Name())
|
|
||||||
}
|
|
||||||
for _, field := range result.Document.Fields {
|
|
||||||
s.getOrDefineField(field.Name())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
|
||||||
s.FieldsMap = make(map[string]uint16, len(s.FieldsInv))
|
|
||||||
for fieldID, fieldName := range s.FieldsInv {
|
|
||||||
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
|
||||||
}
|
|
||||||
|
|
||||||
processField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
|
||||||
for term, tf := range tfs {
|
|
||||||
pidPlus1, exists := s.Dicts[fieldID][term]
|
|
||||||
if !exists {
|
|
||||||
numPostingsLists++
|
|
||||||
pidPlus1 = uint64(numPostingsLists)
|
|
||||||
s.Dicts[fieldID][term] = pidPlus1
|
|
||||||
s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term)
|
|
||||||
numTermsPerPostingsList = append(numTermsPerPostingsList, 0)
|
|
||||||
numLocsPerPostingsList = append(numLocsPerPostingsList, 0)
|
|
||||||
}
|
|
||||||
pid := pidPlus1 - 1
|
|
||||||
numTermsPerPostingsList[pid] += 1
|
|
||||||
numLocsPerPostingsList[pid] += len(tf.Locations)
|
|
||||||
totLocs += len(tf.Locations)
|
|
||||||
}
|
|
||||||
numTokenFrequencies += len(tfs)
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, result := range results {
|
|
||||||
// walk each composite field
|
|
||||||
for _, field := range result.Document.CompositeFields {
|
|
||||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
|
||||||
_, tf := field.Analyze()
|
|
||||||
processField(fieldID, tf)
|
|
||||||
}
|
|
||||||
|
|
||||||
// walk each field
|
|
||||||
for i, field := range result.Document.Fields {
|
|
||||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
|
||||||
tf := result.Analyzed[i]
|
|
||||||
processField(fieldID, tf)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
s.Postings = make([]*roaring.Bitmap, numPostingsLists)
|
|
||||||
for i := 0; i < numPostingsLists; i++ {
|
|
||||||
s.Postings[i] = roaring.New()
|
|
||||||
}
|
|
||||||
s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists)
|
|
||||||
for i := 0; i < numPostingsLists; i++ {
|
|
||||||
s.PostingsLocs[i] = roaring.New()
|
|
||||||
}
|
|
||||||
|
|
||||||
// Preallocate big, contiguous backing arrays.
|
|
||||||
auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos.
|
|
||||||
uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos.
|
|
||||||
float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms.
|
|
||||||
uint16Backing := make([]uint16, totLocs) // For sub-Locfields.
|
|
||||||
|
|
||||||
// Point top-level slices to the backing arrays.
|
|
||||||
s.Freqs = auint64Backing[0:numPostingsLists]
|
|
||||||
auint64Backing = auint64Backing[numPostingsLists:]
|
|
||||||
|
|
||||||
s.Norms = make([][]float32, numPostingsLists)
|
|
||||||
|
|
||||||
s.Locfields = make([][]uint16, numPostingsLists)
|
|
||||||
|
|
||||||
s.Locstarts = auint64Backing[0:numPostingsLists]
|
|
||||||
auint64Backing = auint64Backing[numPostingsLists:]
|
|
||||||
|
|
||||||
s.Locends = auint64Backing[0:numPostingsLists]
|
|
||||||
auint64Backing = auint64Backing[numPostingsLists:]
|
|
||||||
|
|
||||||
s.Locpos = auint64Backing[0:numPostingsLists]
|
|
||||||
auint64Backing = auint64Backing[numPostingsLists:]
|
|
||||||
|
|
||||||
s.Locarraypos = make([][][]uint64, numPostingsLists)
|
|
||||||
|
|
||||||
// Point sub-slices to the backing arrays.
|
|
||||||
for pid, numTerms := range numTermsPerPostingsList {
|
|
||||||
s.Freqs[pid] = uint64Backing[0:0]
|
|
||||||
uint64Backing = uint64Backing[numTerms:]
|
|
||||||
|
|
||||||
s.Norms[pid] = float32Backing[0:0]
|
|
||||||
float32Backing = float32Backing[numTerms:]
|
|
||||||
}
|
|
||||||
|
|
||||||
for pid, numLocs := range numLocsPerPostingsList {
|
|
||||||
s.Locfields[pid] = uint16Backing[0:0]
|
|
||||||
uint16Backing = uint16Backing[numLocs:]
|
|
||||||
|
|
||||||
s.Locstarts[pid] = uint64Backing[0:0]
|
|
||||||
uint64Backing = uint64Backing[numLocs:]
|
|
||||||
|
|
||||||
s.Locends[pid] = uint64Backing[0:0]
|
|
||||||
uint64Backing = uint64Backing[numLocs:]
|
|
||||||
|
|
||||||
s.Locpos[pid] = uint64Backing[0:0]
|
|
||||||
uint64Backing = uint64Backing[numLocs:]
|
|
||||||
|
|
||||||
s.Locarraypos[pid] = auint64Backing[0:0]
|
|
||||||
auint64Backing = auint64Backing[numLocs:]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Segment) processDocument(result *index.AnalysisResult) {
|
|
||||||
// used to collate information across fields
|
|
||||||
docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap))
|
|
||||||
fieldLens := make(map[uint16]int, len(s.FieldsMap))
|
|
||||||
|
|
||||||
docNum := uint64(s.addDocument())
|
|
||||||
|
|
||||||
processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) {
|
|
||||||
fieldLens[field] += l
|
|
||||||
if existingFreqs, ok := docMap[field]; ok {
|
|
||||||
existingFreqs.MergeAll(name, tf)
|
|
||||||
} else {
|
|
||||||
docMap[field] = tf
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) {
|
|
||||||
s.Stored[docNum][field] = append(s.Stored[docNum][field], val)
|
|
||||||
s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ)
|
|
||||||
s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos)
|
|
||||||
}
|
|
||||||
|
|
||||||
// walk each composite field
|
|
||||||
for _, field := range result.Document.CompositeFields {
|
|
||||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
|
||||||
l, tf := field.Analyze()
|
|
||||||
processField(fieldID, field.Name(), l, tf)
|
|
||||||
}
|
|
||||||
|
|
||||||
// walk each field
|
|
||||||
for i, field := range result.Document.Fields {
|
|
||||||
fieldID := uint16(s.getOrDefineField(field.Name()))
|
|
||||||
l := result.Length[i]
|
|
||||||
tf := result.Analyzed[i]
|
|
||||||
processField(fieldID, field.Name(), l, tf)
|
|
||||||
if field.Options().IsStored() {
|
|
||||||
storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions())
|
|
||||||
}
|
|
||||||
|
|
||||||
if field.Options().IncludeDocValues() {
|
|
||||||
s.DocValueFields[fieldID] = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// now that its been rolled up into docMap, walk that
|
|
||||||
for fieldID, tokenFrequencies := range docMap {
|
|
||||||
for term, tokenFreq := range tokenFrequencies {
|
|
||||||
pid := s.Dicts[fieldID][term] - 1
|
|
||||||
bs := s.Postings[pid]
|
|
||||||
bs.AddInt(int(docNum))
|
|
||||||
s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency()))
|
|
||||||
s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))))
|
|
||||||
locationBS := s.PostingsLocs[pid]
|
|
||||||
if len(tokenFreq.Locations) > 0 {
|
|
||||||
locationBS.AddInt(int(docNum))
|
|
||||||
for _, loc := range tokenFreq.Locations {
|
|
||||||
var locf = fieldID
|
|
||||||
if loc.Field != "" {
|
|
||||||
locf = uint16(s.getOrDefineField(loc.Field))
|
|
||||||
}
|
|
||||||
s.Locfields[pid] = append(s.Locfields[pid], locf)
|
|
||||||
s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start))
|
|
||||||
s.Locends[pid] = append(s.Locends[pid], uint64(loc.End))
|
|
||||||
s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position))
|
|
||||||
if len(loc.ArrayPositions) > 0 {
|
|
||||||
s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions)
|
|
||||||
} else {
|
|
||||||
s.Locarraypos[pid] = append(s.Locarraypos[pid], nil)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Segment) getOrDefineField(name string) int {
|
|
||||||
fieldIDPlus1, ok := s.FieldsMap[name]
|
|
||||||
if !ok {
|
|
||||||
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
|
||||||
s.FieldsMap[name] = fieldIDPlus1
|
|
||||||
s.FieldsInv = append(s.FieldsInv, name)
|
|
||||||
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
|
||||||
s.DictKeys = append(s.DictKeys, make([]string, 0))
|
|
||||||
}
|
|
||||||
return int(fieldIDPlus1 - 1)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Segment) addDocument() int {
|
|
||||||
docNum := len(s.Stored)
|
|
||||||
s.Stored = append(s.Stored, map[uint16][][]byte{})
|
|
||||||
s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{})
|
|
||||||
s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{})
|
|
||||||
return docNum
|
|
||||||
}
|
|
||||||
|
|
||||||
func encodeFieldType(f document.Field) byte {
|
|
||||||
fieldType := byte('x')
|
|
||||||
switch f.(type) {
|
|
||||||
case *document.TextField:
|
|
||||||
fieldType = 't'
|
|
||||||
case *document.NumericField:
|
|
||||||
fieldType = 'n'
|
|
||||||
case *document.DateTimeField:
|
|
||||||
fieldType = 'd'
|
|
||||||
case *document.BooleanField:
|
|
||||||
fieldType = 'b'
|
|
||||||
case *document.GeoPointField:
|
|
||||||
fieldType = 'g'
|
|
||||||
case *document.CompositeField:
|
|
||||||
fieldType = 'c'
|
|
||||||
}
|
|
||||||
return fieldType
|
|
||||||
}
|
|
|
@ -1,103 +0,0 @@
|
||||||
// Copyright (c) 2017 Couchbase, Inc.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
package mem
|
|
||||||
|
|
||||||
import (
|
|
||||||
"sort"
|
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
|
||||||
"github.com/blevesearch/bleve/index"
|
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
|
||||||
)
|
|
||||||
|
|
||||||
// Dictionary is the in-memory representation of the term dictionary
|
|
||||||
type Dictionary struct {
|
|
||||||
segment *Segment
|
|
||||||
field string
|
|
||||||
fieldID uint16
|
|
||||||
}
|
|
||||||
|
|
||||||
// PostingsList returns the postings list for the specified term
|
|
||||||
func (d *Dictionary) PostingsList(term string,
|
|
||||||
except *roaring.Bitmap) (segment.PostingsList, error) {
|
|
||||||
return &PostingsList{
|
|
||||||
dictionary: d,
|
|
||||||
term: term,
|
|
||||||
postingsID: d.segment.Dicts[d.fieldID][term],
|
|
||||||
except: except,
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Iterator returns an iterator for this dictionary
|
|
||||||
func (d *Dictionary) Iterator() segment.DictionaryIterator {
|
|
||||||
return &DictionaryIterator{
|
|
||||||
d: d,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// PrefixIterator returns an iterator which only visits terms having the
|
|
||||||
// the specified prefix
|
|
||||||
func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
|
|
||||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix)
|
|
||||||
return &DictionaryIterator{
|
|
||||||
d: d,
|
|
||||||
prefix: prefix,
|
|
||||||
offset: offset,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// RangeIterator returns an iterator which only visits terms between the
|
|
||||||
// start and end terms. NOTE: bleve.index API specifies the end is inclusive.
|
|
||||||
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator {
|
|
||||||
offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start)
|
|
||||||
return &DictionaryIterator{
|
|
||||||
d: d,
|
|
||||||
offset: offset,
|
|
||||||
end: end,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// DictionaryIterator is an iterator for term dictionary
|
|
||||||
type DictionaryIterator struct {
|
|
||||||
d *Dictionary
|
|
||||||
prefix string
|
|
||||||
end string
|
|
||||||
offset int
|
|
||||||
|
|
||||||
dictEntry index.DictEntry // reused across Next()'s
|
|
||||||
}
|
|
||||||
|
|
||||||
// Next returns the next entry in the dictionary
|
|
||||||
func (d *DictionaryIterator) Next() (*index.DictEntry, error) {
|
|
||||||
if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
next := d.d.segment.DictKeys[d.d.fieldID][d.offset]
|
|
||||||
// check prefix
|
|
||||||
if d.prefix != "" && !strings.HasPrefix(next, d.prefix) {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
// check end (bleve.index API demands inclusive end)
|
|
||||||
if d.end != "" && next > d.end {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
d.offset++
|
|
||||||
postingID := d.d.segment.Dicts[d.d.fieldID][next]
|
|
||||||
d.dictEntry.Term = next
|
|
||||||
d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality()
|
|
||||||
return &d.dictEntry, nil
|
|
||||||
}
|
|
|
@ -1,178 +0,0 @@
|
||||||
// Copyright (c) 2017 Couchbase, Inc.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
package mem
|
|
||||||
|
|
||||||
import (
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
|
||||||
)
|
|
||||||
|
|
||||||
// PostingsList is an in-memory represenation of a postings list
|
|
||||||
type PostingsList struct {
|
|
||||||
dictionary *Dictionary
|
|
||||||
term string
|
|
||||||
postingsID uint64
|
|
||||||
except *roaring.Bitmap
|
|
||||||
}
|
|
||||||
|
|
||||||
// Count returns the number of items on this postings list
|
|
||||||
func (p *PostingsList) Count() uint64 {
|
|
||||||
var rv uint64
|
|
||||||
if p.postingsID > 0 {
|
|
||||||
rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality()
|
|
||||||
if p.except != nil {
|
|
||||||
except := p.except.GetCardinality()
|
|
||||||
if except > rv {
|
|
||||||
// avoid underflow
|
|
||||||
except = rv
|
|
||||||
}
|
|
||||||
rv -= except
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv
|
|
||||||
}
|
|
||||||
|
|
||||||
// Iterator returns an iterator for this postings list
|
|
||||||
func (p *PostingsList) Iterator() segment.PostingsIterator {
|
|
||||||
rv := &PostingsIterator{
|
|
||||||
postings: p,
|
|
||||||
}
|
|
||||||
if p.postingsID > 0 {
|
|
||||||
allbits := p.dictionary.segment.Postings[p.postingsID-1]
|
|
||||||
rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1]
|
|
||||||
rv.all = allbits.Iterator()
|
|
||||||
if p.except != nil {
|
|
||||||
allExcept := allbits.Clone()
|
|
||||||
allExcept.AndNot(p.except)
|
|
||||||
rv.actual = allExcept.Iterator()
|
|
||||||
} else {
|
|
||||||
rv.actual = allbits.Iterator()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return rv
|
|
||||||
}
|
|
||||||
|
|
||||||
// PostingsIterator provides a way to iterate through the postings list
|
|
||||||
type PostingsIterator struct {
|
|
||||||
postings *PostingsList
|
|
||||||
all roaring.IntIterable
|
|
||||||
locations *roaring.Bitmap
|
|
||||||
offset int
|
|
||||||
locoffset int
|
|
||||||
actual roaring.IntIterable
|
|
||||||
}
|
|
||||||
|
|
||||||
// Next returns the next posting on the postings list, or nil at the end
|
|
||||||
func (i *PostingsIterator) Next() (segment.Posting, error) {
|
|
||||||
if i.actual == nil || !i.actual.HasNext() {
|
|
||||||
return nil, nil
|
|
||||||
}
|
|
||||||
n := i.actual.Next()
|
|
||||||
allN := i.all.Next()
|
|
||||||
|
|
||||||
// n is the next actual hit (excluding some postings)
|
|
||||||
// allN is the next hit in the full postings
|
|
||||||
// if they don't match, adjust offsets to factor in item we're skipping over
|
|
||||||
// incr the all iterator, and check again
|
|
||||||
for allN != n {
|
|
||||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
|
||||||
i.offset++
|
|
||||||
allN = i.all.Next()
|
|
||||||
}
|
|
||||||
rv := &Posting{
|
|
||||||
iterator: i,
|
|
||||||
docNum: uint64(n),
|
|
||||||
offset: i.offset,
|
|
||||||
locoffset: i.locoffset,
|
|
||||||
hasLoc: i.locations.Contains(n),
|
|
||||||
}
|
|
||||||
|
|
||||||
i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset])
|
|
||||||
i.offset++
|
|
||||||
return rv, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Posting is a single entry in a postings list
|
|
||||||
type Posting struct {
|
|
||||||
iterator *PostingsIterator
|
|
||||||
docNum uint64
|
|
||||||
offset int
|
|
||||||
locoffset int
|
|
||||||
hasLoc bool
|
|
||||||
}
|
|
||||||
|
|
||||||
// Number returns the document number of this posting in this segment
|
|
||||||
func (p *Posting) Number() uint64 {
|
|
||||||
return p.docNum
|
|
||||||
}
|
|
||||||
|
|
||||||
// Frequency returns the frequence of occurance of this term in this doc/field
|
|
||||||
func (p *Posting) Frequency() uint64 {
|
|
||||||
return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Norm returns the normalization factor for this posting
|
|
||||||
func (p *Posting) Norm() float64 {
|
|
||||||
return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset])
|
|
||||||
}
|
|
||||||
|
|
||||||
// Locations returns the location information for each occurance
|
|
||||||
func (p *Posting) Locations() []segment.Location {
|
|
||||||
if !p.hasLoc {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
freq := int(p.Frequency())
|
|
||||||
rv := make([]segment.Location, freq)
|
|
||||||
for i := 0; i < freq; i++ {
|
|
||||||
rv[i] = &Location{
|
|
||||||
p: p,
|
|
||||||
offset: p.locoffset + i,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv
|
|
||||||
}
|
|
||||||
|
|
||||||
// Location represents the location of a single occurance
|
|
||||||
type Location struct {
|
|
||||||
p *Posting
|
|
||||||
offset int
|
|
||||||
}
|
|
||||||
|
|
||||||
// Field returns the name of the field (useful in composite fields to know
|
|
||||||
// which original field the value came from)
|
|
||||||
func (l *Location) Field() string {
|
|
||||||
return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Start returns the start byte offset of this occurance
|
|
||||||
func (l *Location) Start() uint64 {
|
|
||||||
return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset]
|
|
||||||
}
|
|
||||||
|
|
||||||
// End returns the end byte offset of this occurance
|
|
||||||
func (l *Location) End() uint64 {
|
|
||||||
return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset]
|
|
||||||
}
|
|
||||||
|
|
||||||
// Pos returns the 1-based phrase position of this occurance
|
|
||||||
func (l *Location) Pos() uint64 {
|
|
||||||
return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset]
|
|
||||||
}
|
|
||||||
|
|
||||||
// ArrayPositions returns the array position vector associated with this occurance
|
|
||||||
func (l *Location) ArrayPositions() []uint64 {
|
|
||||||
return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset]
|
|
||||||
}
|
|
|
@ -1,289 +0,0 @@
|
||||||
// Copyright (c) 2017 Couchbase, Inc.
|
|
||||||
//
|
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
// you may not use this file except in compliance with the License.
|
|
||||||
// You may obtain a copy of the License at
|
|
||||||
//
|
|
||||||
// http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
//
|
|
||||||
// Unless required by applicable law or agreed to in writing, software
|
|
||||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
// See the License for the specific language governing permissions and
|
|
||||||
// limitations under the License.
|
|
||||||
|
|
||||||
package mem
|
|
||||||
|
|
||||||
import (
|
|
||||||
"fmt"
|
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
|
||||||
)
|
|
||||||
|
|
||||||
// _id field is always guaranteed to have fieldID of 0
|
|
||||||
const idFieldID uint16 = 0
|
|
||||||
|
|
||||||
// KNOWN ISSUES
|
|
||||||
// - LIMITATION - we decided whether or not to store term vectors for a field
|
|
||||||
// at the segment level, based on the first definition of a
|
|
||||||
// field we see. in normal bleve usage this is fine, all
|
|
||||||
// instances of a field definition will be the same. however,
|
|
||||||
// advanced users may violate this and provide unique field
|
|
||||||
// definitions with each document. this segment does not
|
|
||||||
// support this usage.
|
|
||||||
|
|
||||||
// TODO
|
|
||||||
// - need better testing of multiple docs, iterating freqs, locations and
|
|
||||||
// and verifying the correct results are returned
|
|
||||||
|
|
||||||
// Segment is an in memory implementation of scorch.Segment
|
|
||||||
type Segment struct {
|
|
||||||
|
|
||||||
// FieldsMap adds 1 to field id to avoid zero value issues
|
|
||||||
// name -> field id + 1
|
|
||||||
FieldsMap map[string]uint16
|
|
||||||
|
|
||||||
// FieldsInv is the inverse of FieldsMap
|
|
||||||
// field id -> name
|
|
||||||
FieldsInv []string
|
|
||||||
|
|
||||||
// Term dictionaries for each field
|
|
||||||
// field id -> term -> postings list id + 1
|
|
||||||
Dicts []map[string]uint64
|
|
||||||
|
|
||||||
// Terms for each field, where terms are sorted ascending
|
|
||||||
// field id -> []term
|
|
||||||
DictKeys [][]string
|
|
||||||
|
|
||||||
// Postings list
|
|
||||||
// postings list id -> bitmap by docNum
|
|
||||||
Postings []*roaring.Bitmap
|
|
||||||
|
|
||||||
// Postings list has locations
|
|
||||||
PostingsLocs []*roaring.Bitmap
|
|
||||||
|
|
||||||
// Term frequencies
|
|
||||||
// postings list id -> Freqs (one for each hit in bitmap)
|
|
||||||
Freqs [][]uint64
|
|
||||||
|
|
||||||
// Field norms
|
|
||||||
// postings list id -> Norms (one for each hit in bitmap)
|
|
||||||
Norms [][]float32
|
|
||||||
|
|
||||||
// Field/start/end/pos/locarraypos
|
|
||||||
// postings list id -> start/end/pos/locarraypos (one for each freq)
|
|
||||||
Locfields [][]uint16
|
|
||||||
Locstarts [][]uint64
|
|
||||||
Locends [][]uint64
|
|
||||||
Locpos [][]uint64
|
|
||||||
Locarraypos [][][]uint64
|
|
||||||
|
|
||||||
// Stored field values
|
|
||||||
// docNum -> field id -> slice of values (each value []byte)
|
|
||||||
Stored []map[uint16][][]byte
|
|
||||||
|
|
||||||
// Stored field types
|
|
||||||
// docNum -> field id -> slice of types (each type byte)
|
|
||||||
StoredTypes []map[uint16][]byte
|
|
||||||
|
|
||||||
// Stored field array positions
|
|
||||||
// docNum -> field id -> slice of array positions (each is []uint64)
|
|
||||||
StoredPos []map[uint16][][]uint64
|
|
||||||
|
|
||||||
// For storing the docValue persisted fields
|
|
||||||
DocValueFields map[uint16]bool
|
|
||||||
|
|
||||||
// Footprint of the segment, updated when analyzed document mutations
|
|
||||||
// are added into the segment
|
|
||||||
sizeInBytes uint64
|
|
||||||
}
|
|
||||||
|
|
||||||
// New builds a new empty Segment
|
|
||||||
func New() *Segment {
|
|
||||||
return &Segment{
|
|
||||||
FieldsMap: map[string]uint16{},
|
|
||||||
DocValueFields: map[uint16]bool{},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Segment) updateSizeInBytes() {
|
|
||||||
var sizeInBytes uint64
|
|
||||||
|
|
||||||
// FieldsMap, FieldsInv
|
|
||||||
for k, _ := range s.FieldsMap {
|
|
||||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
|
||||||
2 /* size of uint16 */)
|
|
||||||
}
|
|
||||||
// overhead from the data structures
|
|
||||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
|
||||||
|
|
||||||
// Dicts, DictKeys
|
|
||||||
for _, entry := range s.Dicts {
|
|
||||||
for k, _ := range entry {
|
|
||||||
sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 +
|
|
||||||
8 /* size of uint64 */)
|
|
||||||
}
|
|
||||||
// overhead from the data structures
|
|
||||||
sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice)
|
|
||||||
}
|
|
||||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
|
||||||
|
|
||||||
// Postings, PostingsLocs
|
|
||||||
for i := 0; i < len(s.Postings); i++ {
|
|
||||||
sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) +
|
|
||||||
(s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer)
|
|
||||||
}
|
|
||||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
|
||||||
|
|
||||||
// Freqs, Norms
|
|
||||||
for i := 0; i < len(s.Freqs); i++ {
|
|
||||||
sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ +
|
|
||||||
len(s.Norms[i])*4 /* size of float32 */) +
|
|
||||||
(segment.SizeOfSlice * 2)
|
|
||||||
}
|
|
||||||
sizeInBytes += (segment.SizeOfSlice * 2)
|
|
||||||
|
|
||||||
// Location data
|
|
||||||
for i := 0; i < len(s.Locfields); i++ {
|
|
||||||
sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ +
|
|
||||||
len(s.Locstarts[i])*8 /* size of uint64 */ +
|
|
||||||
len(s.Locends[i])*8 /* size of uint64 */ +
|
|
||||||
len(s.Locpos[i])*8 /* size of uint64 */)
|
|
||||||
|
|
||||||
for j := 0; j < len(s.Locarraypos[i]); j++ {
|
|
||||||
sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) +
|
|
||||||
segment.SizeOfSlice
|
|
||||||
}
|
|
||||||
|
|
||||||
sizeInBytes += (segment.SizeOfSlice * 5)
|
|
||||||
}
|
|
||||||
sizeInBytes += (segment.SizeOfSlice * 5)
|
|
||||||
|
|
||||||
// Stored data
|
|
||||||
for i := 0; i < len(s.Stored); i++ {
|
|
||||||
for _, v := range s.Stored[i] {
|
|
||||||
sizeInBytes += uint64(2 /* size of uint16 */)
|
|
||||||
for _, arr := range v {
|
|
||||||
sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice
|
|
||||||
}
|
|
||||||
sizeInBytes += segment.SizeOfSlice
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, v := range s.StoredTypes[i] {
|
|
||||||
sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, v := range s.StoredPos[i] {
|
|
||||||
sizeInBytes += uint64(2 /* size of uint16 */)
|
|
||||||
for _, arr := range v {
|
|
||||||
sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) +
|
|
||||||
segment.SizeOfSlice
|
|
||||||
}
|
|
||||||
sizeInBytes += segment.SizeOfSlice
|
|
||||||
}
|
|
||||||
|
|
||||||
// overhead from map(s) within Stored, StoredTypes, StoredPos
|
|
||||||
sizeInBytes += (segment.SizeOfMap * 3)
|
|
||||||
}
|
|
||||||
// overhead from data structures: Stored, StoredTypes, StoredPos
|
|
||||||
sizeInBytes += (segment.SizeOfSlice * 3)
|
|
||||||
|
|
||||||
// DocValueFields
|
|
||||||
sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) +
|
|
||||||
segment.SizeOfMap
|
|
||||||
|
|
||||||
// SizeInBytes
|
|
||||||
sizeInBytes += uint64(8)
|
|
||||||
|
|
||||||
s.sizeInBytes = sizeInBytes
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Segment) SizeInBytes() uint64 {
|
|
||||||
return s.sizeInBytes
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Segment) AddRef() {
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Segment) DecRef() error {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Fields returns the field names used in this segment
|
|
||||||
func (s *Segment) Fields() []string {
|
|
||||||
return s.FieldsInv
|
|
||||||
}
|
|
||||||
|
|
||||||
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
|
||||||
// for the specified doc number
|
|
||||||
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
|
||||||
// ensure document number exists
|
|
||||||
if int(num) > len(s.Stored)-1 {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
docFields := s.Stored[int(num)]
|
|
||||||
st := s.StoredTypes[int(num)]
|
|
||||||
sp := s.StoredPos[int(num)]
|
|
||||||
for field, values := range docFields {
|
|
||||||
for i, value := range values {
|
|
||||||
keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i])
|
|
||||||
if !keepGoing {
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *Segment) getField(name string) (int, error) {
|
|
||||||
fieldID, ok := s.FieldsMap[name]
|
|
||||||
if !ok {
|
|
||||||
return 0, fmt.Errorf("no field named %s", name)
|
|
||||||
}
|
|
||||||
return int(fieldID - 1), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Dictionary returns the term dictionary for the specified field
|
|
||||||
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
|
|
||||||
fieldID, err := s.getField(field)
|
|
||||||
if err != nil {
|
|
||||||
// no such field, return empty dictionary
|
|
||||||
return &segment.EmptyDictionary{}, nil
|
|
||||||
}
|
|
||||||
return &Dictionary{
|
|
||||||
segment: s,
|
|
||||||
field: field,
|
|
||||||
fieldID: uint16(fieldID),
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Count returns the number of documents in this segment
|
|
||||||
// (this has no notion of deleted docs)
|
|
||||||
func (s *Segment) Count() uint64 {
|
|
||||||
return uint64(len(s.Stored))
|
|
||||||
}
|
|
||||||
|
|
||||||
// DocNumbers returns a bitset corresponding to the doc numbers of all the
|
|
||||||
// provided _id strings
|
|
||||||
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
|
||||||
rv := roaring.New()
|
|
||||||
|
|
||||||
// guard against empty segment
|
|
||||||
if len(s.FieldsMap) > 0 {
|
|
||||||
idDictionary := s.Dicts[idFieldID]
|
|
||||||
|
|
||||||
for _, id := range ids {
|
|
||||||
postingID := idDictionary[id]
|
|
||||||
if postingID > 0 {
|
|
||||||
rv.Or(s.Postings[postingID-1])
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// Close releases all resources associated with this segment
|
|
||||||
func (s *Segment) Close() error {
|
|
||||||
return nil
|
|
||||||
}
|
|
75
vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go
generated
vendored
Normal file
75
vendor/github.com/blevesearch/bleve/index/scorch/segment/regexp.go
generated
vendored
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package segment
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp/syntax"
|
||||||
|
|
||||||
|
"github.com/couchbase/vellum/regexp"
|
||||||
|
)
|
||||||
|
|
||||||
|
func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) {
|
||||||
|
// TODO: potential optimization where syntax.Regexp supports a Simplify() API?
|
||||||
|
|
||||||
|
parsed, err := syntax.Parse(pattern, syntax.Perl)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
prefix := LiteralPrefix(parsed)
|
||||||
|
if prefix != "" {
|
||||||
|
prefixBeg := []byte(prefix)
|
||||||
|
prefixEnd := IncrementBytes(prefixBeg)
|
||||||
|
return re, prefixBeg, prefixEnd, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return re, nil, nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// LiteralPrefix returns the literal prefix given the parse tree for a
// regexp, or "" when the expression has no usable literal prefix.
//
// It follows the left-most branch of the tree for as long as the node is a
// concatenation, and then reports the runes of the node it lands on if that
// node is a literal.
//
// A nil tree yields "". A case-insensitive literal (one carrying the
// syntax.FoldCase flag, e.g. from "(?i)abc") also yields "": such a literal
// matches several different byte sequences, so no single prefix covers all
// of its matches.
func LiteralPrefix(s *syntax.Regexp) string {
	// traverse the left-most branch in the parse tree as long as the
	// node represents a concatenation
	for s != nil && s.Op == syntax.OpConcat {
		if len(s.Sub) < 1 {
			return ""
		}

		s = s.Sub[0]
	}

	// the loop above also stops on a nil node; do not dereference it
	if s == nil {
		return ""
	}

	if s.Op == syntax.OpLiteral && s.Flags&syntax.FoldCase == 0 {
		return string(s.Rune)
	}

	return "" // no literal prefix
}
|
||||||
|
|
||||||
|
// IncrementBytes returns a copy of in with one added to it, treating the
// slice as a big-endian unsigned integer. The input is never modified. When
// the addition carries out of the most significant byte (every byte was
// 0xff, or in is empty) nil is returned.
func IncrementBytes(in []byte) []byte {
	out := append(make([]byte, 0, len(in)), in...)
	for pos := len(out); pos > 0; pos-- {
		out[pos-1]++
		if out[pos-1] != 0 {
			return out // didn't overflow, so stop
		}
		// this byte wrapped to zero: carry into the next byte to the left
	}
	return nil // overflowed
}
|
|
@ -15,15 +15,14 @@
|
||||||
package segment
|
package segment
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
|
"github.com/couchbase/vellum"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Overhead from go data structures when deployed on a 64-bit system.
|
var ErrClosed = fmt.Errorf("index closed")
|
||||||
const SizeOfMap uint64 = 8
|
|
||||||
const SizeOfPointer uint64 = 8
|
|
||||||
const SizeOfSlice uint64 = 24
|
|
||||||
const SizeOfString uint64 = 16
|
|
||||||
|
|
||||||
// DocumentFieldValueVisitor defines a callback to be visited for each
|
// DocumentFieldValueVisitor defines a callback to be visited for each
|
||||||
// stored field value. The return value determines if the visitor
|
// stored field value. The return value determines if the visitor
|
||||||
|
@ -34,6 +33,9 @@ type Segment interface {
|
||||||
Dictionary(field string) (TermDictionary, error)
|
Dictionary(field string) (TermDictionary, error)
|
||||||
|
|
||||||
VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error
|
VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error
|
||||||
|
|
||||||
|
DocID(num uint64) ([]byte, error)
|
||||||
|
|
||||||
Count() uint64
|
Count() uint64
|
||||||
|
|
||||||
DocNumbers([]string) (*roaring.Bitmap, error)
|
DocNumbers([]string) (*roaring.Bitmap, error)
|
||||||
|
@ -42,18 +44,21 @@ type Segment interface {
|
||||||
|
|
||||||
Close() error
|
Close() error
|
||||||
|
|
||||||
SizeInBytes() uint64
|
Size() int
|
||||||
|
|
||||||
AddRef()
|
AddRef()
|
||||||
DecRef() error
|
DecRef() error
|
||||||
}
|
}
|
||||||
|
|
||||||
type TermDictionary interface {
|
type TermDictionary interface {
|
||||||
PostingsList(term string, except *roaring.Bitmap) (PostingsList, error)
|
PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error)
|
||||||
|
|
||||||
Iterator() DictionaryIterator
|
Iterator() DictionaryIterator
|
||||||
PrefixIterator(prefix string) DictionaryIterator
|
PrefixIterator(prefix string) DictionaryIterator
|
||||||
RangeIterator(start, end string) DictionaryIterator
|
RangeIterator(start, end string) DictionaryIterator
|
||||||
|
AutomatonIterator(a vellum.Automaton,
|
||||||
|
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator
|
||||||
|
OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator
|
||||||
}
|
}
|
||||||
|
|
||||||
type DictionaryIterator interface {
|
type DictionaryIterator interface {
|
||||||
|
@ -61,7 +66,9 @@ type DictionaryIterator interface {
|
||||||
}
|
}
|
||||||
|
|
||||||
type PostingsList interface {
|
type PostingsList interface {
|
||||||
Iterator() PostingsIterator
|
Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) PostingsIterator
|
||||||
|
|
||||||
|
Size() int
|
||||||
|
|
||||||
Count() uint64
|
Count() uint64
|
||||||
|
|
||||||
|
@ -77,6 +84,14 @@ type PostingsIterator interface {
|
||||||
// implementations may return a shared instance to reduce memory
|
// implementations may return a shared instance to reduce memory
|
||||||
// allocations.
|
// allocations.
|
||||||
Next() (Posting, error)
|
Next() (Posting, error)
|
||||||
|
|
||||||
|
// Advance will return the posting with the specified doc number
|
||||||
|
// or if there is no such posting, the next posting.
|
||||||
|
// Callers MUST NOT attempt to pass a docNum that is less than or
|
||||||
|
// equal to the currently visited posting doc Num.
|
||||||
|
Advance(docNum uint64) (Posting, error)
|
||||||
|
|
||||||
|
Size() int
|
||||||
}
|
}
|
||||||
|
|
||||||
type Posting interface {
|
type Posting interface {
|
||||||
|
@ -86,6 +101,8 @@ type Posting interface {
|
||||||
Norm() float64
|
Norm() float64
|
||||||
|
|
||||||
Locations() []Location
|
Locations() []Location
|
||||||
|
|
||||||
|
Size() int
|
||||||
}
|
}
|
||||||
|
|
||||||
type Location interface {
|
type Location interface {
|
||||||
|
@ -94,6 +111,7 @@ type Location interface {
|
||||||
End() uint64
|
End() uint64
|
||||||
Pos() uint64
|
Pos() uint64
|
||||||
ArrayPositions() []uint64
|
ArrayPositions() []uint64
|
||||||
|
Size() int
|
||||||
}
|
}
|
||||||
|
|
||||||
// DocumentFieldTermVisitable is implemented by various scorch segment
|
// DocumentFieldTermVisitable is implemented by various scorch segment
|
||||||
|
@ -101,10 +119,17 @@ type Location interface {
|
||||||
// postings or other indexed values.
|
// postings or other indexed values.
|
||||||
type DocumentFieldTermVisitable interface {
|
type DocumentFieldTermVisitable interface {
|
||||||
VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||||
visitor index.DocumentFieldTermVisitor) error
|
visitor index.DocumentFieldTermVisitor, optional DocVisitState) (DocVisitState, error)
|
||||||
|
|
||||||
// VisitableDocValueFields implementation should return
|
// VisitableDocValueFields implementation should return
|
||||||
// the list of fields which are document value persisted and
|
// the list of fields which are document value persisted and
|
||||||
// therefore visitable by the above VisitDocumentFieldTerms method.
|
// therefore visitable by the above VisitDocumentFieldTerms method.
|
||||||
VisitableDocValueFields() ([]string, error)
|
VisitableDocValueFields() ([]string, error)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type DocVisitState interface {
|
||||||
|
}
|
||||||
|
|
||||||
|
type StatsReporter interface {
|
||||||
|
ReportBytesWritten(bytesWritten uint64)
|
||||||
|
}
|
||||||
|
|
|
@ -16,19 +16,13 @@ package zap
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
|
||||||
"encoding/binary"
|
|
||||||
"math"
|
"math"
|
||||||
"os"
|
"os"
|
||||||
"sort"
|
|
||||||
|
|
||||||
"github.com/Smerity/govarint"
|
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment/mem"
|
|
||||||
"github.com/couchbase/vellum"
|
|
||||||
"github.com/golang/snappy"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
const version uint32 = 3
|
const Version uint32 = 11
|
||||||
|
|
||||||
|
const Type string = "zap"
|
||||||
|
|
||||||
const fieldNotUninverted = math.MaxUint64
|
const fieldNotUninverted = math.MaxUint64
|
||||||
|
|
||||||
|
@ -82,219 +76,39 @@ func PersistSegmentBase(sb *SegmentBase, path string) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// PersistSegment takes the in-memory segment and persists it to
|
|
||||||
// the specified path in the zap file format.
|
|
||||||
func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error {
|
|
||||||
flag := os.O_RDWR | os.O_CREATE
|
|
||||||
|
|
||||||
f, err := os.OpenFile(path, flag, 0600)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
cleanup := func() {
|
|
||||||
_ = f.Close()
|
|
||||||
_ = os.Remove(path)
|
|
||||||
}
|
|
||||||
|
|
||||||
// buffer the output
|
|
||||||
br := bufio.NewWriter(f)
|
|
||||||
|
|
||||||
// wrap it for counting (tracking offsets)
|
|
||||||
cr := NewCountHashWriter(br)
|
|
||||||
|
|
||||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err :=
|
|
||||||
persistBase(memSegment, cr, chunkFactor)
|
|
||||||
if err != nil {
|
|
||||||
cleanup()
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset,
|
|
||||||
chunkFactor, cr.Sum32(), cr)
|
|
||||||
if err != nil {
|
|
||||||
cleanup()
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
err = br.Flush()
|
|
||||||
if err != nil {
|
|
||||||
cleanup()
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
err = f.Sync()
|
|
||||||
if err != nil {
|
|
||||||
cleanup()
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
err = f.Close()
|
|
||||||
if err != nil {
|
|
||||||
cleanup()
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) (
|
|
||||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
|
|
||||||
dictLocs []uint64, err error) {
|
|
||||||
docValueOffset = uint64(fieldNotUninverted)
|
|
||||||
|
|
||||||
if len(memSegment.Stored) > 0 {
|
|
||||||
storedIndexOffset, err = persistStored(memSegment, cr)
|
|
||||||
if err != nil {
|
|
||||||
return 0, 0, 0, 0, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor)
|
|
||||||
if err != nil {
|
|
||||||
return 0, 0, 0, 0, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
postingsListLocs, err := persistPostingsLocs(memSegment, cr)
|
|
||||||
if err != nil {
|
|
||||||
return 0, 0, 0, 0, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets)
|
|
||||||
if err != nil {
|
|
||||||
return 0, 0, 0, 0, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
dictLocs, err = persistDictionary(memSegment, cr, postingsLocs)
|
|
||||||
if err != nil {
|
|
||||||
return 0, 0, 0, 0, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor)
|
|
||||||
if err != nil {
|
|
||||||
return 0, 0, 0, 0, nil, err
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
dictLocs = make([]uint64, len(memSegment.FieldsInv))
|
|
||||||
}
|
|
||||||
|
|
||||||
fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs)
|
|
||||||
if err != nil {
|
|
||||||
return 0, 0, 0, 0, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset,
|
|
||||||
dictLocs, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) {
|
|
||||||
var curr int
|
|
||||||
var metaBuf bytes.Buffer
|
|
||||||
var data, compressed []byte
|
|
||||||
|
|
||||||
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
|
|
||||||
|
|
||||||
docNumOffsets := make(map[int]uint64, len(memSegment.Stored))
|
|
||||||
|
|
||||||
for docNum, storedValues := range memSegment.Stored {
|
|
||||||
if docNum != 0 {
|
|
||||||
// reset buffer if necessary
|
|
||||||
curr = 0
|
|
||||||
metaBuf.Reset()
|
|
||||||
data = data[:0]
|
|
||||||
compressed = compressed[:0]
|
|
||||||
}
|
|
||||||
|
|
||||||
st := memSegment.StoredTypes[docNum]
|
|
||||||
sp := memSegment.StoredPos[docNum]
|
|
||||||
|
|
||||||
// encode fields in order
|
|
||||||
for fieldID := range memSegment.FieldsInv {
|
|
||||||
if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok {
|
|
||||||
stf := st[uint16(fieldID)]
|
|
||||||
spf := sp[uint16(fieldID)]
|
|
||||||
|
|
||||||
var err2 error
|
|
||||||
curr, data, err2 = persistStoredFieldValues(fieldID,
|
|
||||||
storedFieldValues, stf, spf, curr, metaEncoder, data)
|
|
||||||
if err2 != nil {
|
|
||||||
return 0, err2
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
metaEncoder.Close()
|
|
||||||
metaBytes := metaBuf.Bytes()
|
|
||||||
|
|
||||||
// compress the data
|
|
||||||
compressed = snappy.Encode(compressed, data)
|
|
||||||
|
|
||||||
// record where we're about to start writing
|
|
||||||
docNumOffsets[docNum] = uint64(w.Count())
|
|
||||||
|
|
||||||
// write out the meta len and compressed data len
|
|
||||||
_, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
|
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// now write the meta
|
|
||||||
_, err = w.Write(metaBytes)
|
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
// now write the compressed data
|
|
||||||
_, err = w.Write(compressed)
|
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// return value is the start of the stored index
|
|
||||||
rv := uint64(w.Count())
|
|
||||||
// now write out the stored doc index
|
|
||||||
for docNum := range memSegment.Stored {
|
|
||||||
err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum])
|
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return rv, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func persistStoredFieldValues(fieldID int,
|
func persistStoredFieldValues(fieldID int,
|
||||||
storedFieldValues [][]byte, stf []byte, spf [][]uint64,
|
storedFieldValues [][]byte, stf []byte, spf [][]uint64,
|
||||||
curr int, metaEncoder *govarint.Base128Encoder, data []byte) (
|
curr int, metaEncode varintEncoder, data []byte) (
|
||||||
int, []byte, error) {
|
int, []byte, error) {
|
||||||
for i := 0; i < len(storedFieldValues); i++ {
|
for i := 0; i < len(storedFieldValues); i++ {
|
||||||
// encode field
|
// encode field
|
||||||
_, err := metaEncoder.PutU64(uint64(fieldID))
|
_, err := metaEncode(uint64(fieldID))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, err
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
// encode type
|
// encode type
|
||||||
_, err = metaEncoder.PutU64(uint64(stf[i]))
|
_, err = metaEncode(uint64(stf[i]))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, err
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
// encode start offset
|
// encode start offset
|
||||||
_, err = metaEncoder.PutU64(uint64(curr))
|
_, err = metaEncode(uint64(curr))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, err
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
// end len
|
// end len
|
||||||
_, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i])))
|
_, err = metaEncode(uint64(len(storedFieldValues[i])))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, err
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
// encode number of array pos
|
// encode number of array pos
|
||||||
_, err = metaEncoder.PutU64(uint64(len(spf[i])))
|
_, err = metaEncode(uint64(len(spf[i])))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, err
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
// encode all array positions
|
// encode all array positions
|
||||||
for _, pos := range spf[i] {
|
for _, pos := range spf[i] {
|
||||||
_, err = metaEncoder.PutU64(pos)
|
_, err = metaEncode(pos)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, err
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
|
@ -307,337 +121,6 @@ func persistStoredFieldValues(fieldID int,
|
||||||
return curr, data, nil
|
return curr, data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) {
|
|
||||||
var freqOffsets, locOfffsets []uint64
|
|
||||||
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
|
||||||
for postingID := range memSegment.Postings {
|
|
||||||
if postingID != 0 {
|
|
||||||
tfEncoder.Reset()
|
|
||||||
}
|
|
||||||
freqs := memSegment.Freqs[postingID]
|
|
||||||
norms := memSegment.Norms[postingID]
|
|
||||||
postingsListItr := memSegment.Postings[postingID].Iterator()
|
|
||||||
var offset int
|
|
||||||
for postingsListItr.HasNext() {
|
|
||||||
|
|
||||||
docNum := uint64(postingsListItr.Next())
|
|
||||||
|
|
||||||
// put freq
|
|
||||||
err := tfEncoder.Add(docNum, freqs[offset])
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// put norm
|
|
||||||
norm := norms[offset]
|
|
||||||
normBits := math.Float32bits(norm)
|
|
||||||
err = tfEncoder.Add(docNum, uint64(normBits))
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
offset++
|
|
||||||
}
|
|
||||||
|
|
||||||
// record where this postings freq info starts
|
|
||||||
freqOffsets = append(freqOffsets, uint64(w.Count()))
|
|
||||||
|
|
||||||
tfEncoder.Close()
|
|
||||||
_, err := tfEncoder.Write(w)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// now do it again for the locations
|
|
||||||
locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
|
||||||
for postingID := range memSegment.Postings {
|
|
||||||
if postingID != 0 {
|
|
||||||
locEncoder.Reset()
|
|
||||||
}
|
|
||||||
freqs := memSegment.Freqs[postingID]
|
|
||||||
locfields := memSegment.Locfields[postingID]
|
|
||||||
locpos := memSegment.Locpos[postingID]
|
|
||||||
locstarts := memSegment.Locstarts[postingID]
|
|
||||||
locends := memSegment.Locends[postingID]
|
|
||||||
locarraypos := memSegment.Locarraypos[postingID]
|
|
||||||
postingsListItr := memSegment.Postings[postingID].Iterator()
|
|
||||||
var offset int
|
|
||||||
var locOffset int
|
|
||||||
for postingsListItr.HasNext() {
|
|
||||||
docNum := uint64(postingsListItr.Next())
|
|
||||||
for i := 0; i < int(freqs[offset]); i++ {
|
|
||||||
if len(locfields) > 0 {
|
|
||||||
// put field
|
|
||||||
err := locEncoder.Add(docNum, uint64(locfields[locOffset]))
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// put pos
|
|
||||||
err = locEncoder.Add(docNum, locpos[locOffset])
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// put start
|
|
||||||
err = locEncoder.Add(docNum, locstarts[locOffset])
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// put end
|
|
||||||
err = locEncoder.Add(docNum, locends[locOffset])
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// put the number of array positions to follow
|
|
||||||
num := len(locarraypos[locOffset])
|
|
||||||
err = locEncoder.Add(docNum, uint64(num))
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// put each array position
|
|
||||||
for _, pos := range locarraypos[locOffset] {
|
|
||||||
err = locEncoder.Add(docNum, pos)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
locOffset++
|
|
||||||
}
|
|
||||||
offset++
|
|
||||||
}
|
|
||||||
|
|
||||||
// record where this postings loc info starts
|
|
||||||
locOfffsets = append(locOfffsets, uint64(w.Count()))
|
|
||||||
locEncoder.Close()
|
|
||||||
_, err := locEncoder.Write(w)
|
|
||||||
if err != nil {
|
|
||||||
return nil, nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return freqOffsets, locOfffsets, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) {
|
|
||||||
rv = make([]uint64, 0, len(memSegment.PostingsLocs))
|
|
||||||
var reuseBuf bytes.Buffer
|
|
||||||
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
|
|
||||||
for postingID := range memSegment.PostingsLocs {
|
|
||||||
// record where we start this posting loc
|
|
||||||
rv = append(rv, uint64(w.Count()))
|
|
||||||
// write out the length and bitmap
|
|
||||||
_, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter,
|
|
||||||
postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) {
|
|
||||||
rv = make([]uint64, 0, len(memSegment.Postings))
|
|
||||||
var reuseBuf bytes.Buffer
|
|
||||||
reuseBufVarint := make([]byte, binary.MaxVarintLen64)
|
|
||||||
for postingID := range memSegment.Postings {
|
|
||||||
// record where we start this posting list
|
|
||||||
rv = append(rv, uint64(w.Count()))
|
|
||||||
|
|
||||||
// write out the term info, loc info, and loc posting list offset
|
|
||||||
_, err = writeUvarints(w, freqOffsets[postingID],
|
|
||||||
locOffsets[postingID], postingsListLocs[postingID])
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// write out the length and bitmap
|
|
||||||
_, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) {
|
|
||||||
rv := make([]uint64, 0, len(memSegment.DictKeys))
|
|
||||||
|
|
||||||
varintBuf := make([]byte, binary.MaxVarintLen64)
|
|
||||||
|
|
||||||
var buffer bytes.Buffer
|
|
||||||
for fieldID, fieldTerms := range memSegment.DictKeys {
|
|
||||||
if fieldID != 0 {
|
|
||||||
buffer.Reset()
|
|
||||||
}
|
|
||||||
|
|
||||||
// start a new vellum for this field
|
|
||||||
builder, err := vellum.New(&buffer, nil)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
dict := memSegment.Dicts[fieldID]
|
|
||||||
// now walk the dictionary in order of fieldTerms (already sorted)
|
|
||||||
for _, fieldTerm := range fieldTerms {
|
|
||||||
postingID := dict[fieldTerm] - 1
|
|
||||||
postingsAddr := postingsLocs[postingID]
|
|
||||||
err = builder.Insert([]byte(fieldTerm), postingsAddr)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
err = builder.Close()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// record where this dictionary starts
|
|
||||||
rv = append(rv, uint64(w.Count()))
|
|
||||||
|
|
||||||
vellumData := buffer.Bytes()
|
|
||||||
|
|
||||||
// write out the length of the vellum data
|
|
||||||
n := binary.PutUvarint(varintBuf, uint64(len(vellumData)))
|
|
||||||
_, err = w.Write(varintBuf[:n])
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
// write this vellum to disk
|
|
||||||
_, err = w.Write(vellumData)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return rv, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// docIDRange is a slice of doc numbers that can be sorted in ascending
// order through the Len/Swap/Less method set.
type docIDRange []uint64

// Len reports the number of doc numbers held.
func (r docIDRange) Len() int {
	return len(r)
}

// Swap exchanges the doc numbers at positions i and j.
func (r docIDRange) Swap(i, j int) {
	tmp := r[i]
	r[i] = r[j]
	r[j] = tmp
}

// Less reports whether the doc number at i is smaller than the one at j.
func (r docIDRange) Less(i, j int) bool {
	return r[j] > r[i]
}
|
|
||||||
|
|
||||||
func persistDocValues(memSegment *mem.Segment, w *CountHashWriter,
|
|
||||||
chunkFactor uint32) (map[uint16]uint64, error) {
|
|
||||||
fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv))
|
|
||||||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1))
|
|
||||||
|
|
||||||
for fieldID := range memSegment.DocValueFields {
|
|
||||||
field := memSegment.FieldsInv[fieldID]
|
|
||||||
docTermMap := make(map[uint64][]byte, 0)
|
|
||||||
dict, err := memSegment.Dictionary(field)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
dictItr := dict.Iterator()
|
|
||||||
next, err := dictItr.Next()
|
|
||||||
for err == nil && next != nil {
|
|
||||||
postings, err1 := dict.PostingsList(next.Term, nil)
|
|
||||||
if err1 != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
postingsItr := postings.Iterator()
|
|
||||||
nextPosting, err2 := postingsItr.Next()
|
|
||||||
for err2 == nil && nextPosting != nil {
|
|
||||||
docNum := nextPosting.Number()
|
|
||||||
docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...)
|
|
||||||
docTermMap[docNum] = append(docTermMap[docNum], termSeparator)
|
|
||||||
nextPosting, err2 = postingsItr.Next()
|
|
||||||
}
|
|
||||||
if err2 != nil {
|
|
||||||
return nil, err2
|
|
||||||
}
|
|
||||||
|
|
||||||
next, err = dictItr.Next()
|
|
||||||
}
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
// sort wrt to docIDs
|
|
||||||
var docNumbers docIDRange
|
|
||||||
for k := range docTermMap {
|
|
||||||
docNumbers = append(docNumbers, k)
|
|
||||||
}
|
|
||||||
sort.Sort(docNumbers)
|
|
||||||
|
|
||||||
for _, docNum := range docNumbers {
|
|
||||||
err = fdvEncoder.Add(docNum, docTermMap[docNum])
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fieldChunkOffsets[fieldID] = uint64(w.Count())
|
|
||||||
err = fdvEncoder.Close()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
// persist the doc value details for this field
|
|
||||||
_, err = fdvEncoder.Write(w)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
// reseting encoder for the next field
|
|
||||||
fdvEncoder.Reset()
|
|
||||||
}
|
|
||||||
|
|
||||||
return fieldChunkOffsets, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter,
|
|
||||||
chunkFactor uint32) (uint64, error) {
|
|
||||||
fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor)
|
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
|
|
||||||
fieldDocValuesOffset := uint64(w.Count())
|
|
||||||
buf := make([]byte, binary.MaxVarintLen64)
|
|
||||||
offset := uint64(0)
|
|
||||||
ok := true
|
|
||||||
for fieldID := range memSegment.FieldsInv {
|
|
||||||
// if the field isn't configured for docValue, then mark
|
|
||||||
// the offset accordingly
|
|
||||||
if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok {
|
|
||||||
offset = fieldNotUninverted
|
|
||||||
}
|
|
||||||
n := binary.PutUvarint(buf, uint64(offset))
|
|
||||||
_, err := w.Write(buf[:n])
|
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return fieldDocValuesOffset, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) {
|
|
||||||
var br bytes.Buffer
|
|
||||||
|
|
||||||
cr := NewCountHashWriter(&br)
|
|
||||||
|
|
||||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err :=
|
|
||||||
persistBase(memSegment, cr, chunkFactor)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor,
|
|
||||||
memSegment.FieldsMap, memSegment.FieldsInv, numDocs,
|
|
||||||
storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs)
|
|
||||||
}
|
|
||||||
|
|
||||||
func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
|
func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
|
||||||
fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64,
|
fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64,
|
||||||
storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64,
|
storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64,
|
||||||
|
@ -653,10 +136,11 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32,
|
||||||
fieldsIndexOffset: fieldsIndexOffset,
|
fieldsIndexOffset: fieldsIndexOffset,
|
||||||
docValueOffset: docValueOffset,
|
docValueOffset: docValueOffset,
|
||||||
dictLocs: dictLocs,
|
dictLocs: dictLocs,
|
||||||
fieldDvIterMap: make(map[uint16]*docValueIterator),
|
fieldDvReaders: make(map[uint16]*docValueReader),
|
||||||
}
|
}
|
||||||
|
sb.updateSize()
|
||||||
|
|
||||||
err := sb.loadDvIterators()
|
err := sb.loadDvReaders()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,10 +18,18 @@ import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"io"
|
"io"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/golang/snappy"
|
"github.com/golang/snappy"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeMetaData int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var md MetaData
|
||||||
|
reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size())
|
||||||
|
}
|
||||||
|
|
||||||
var termSeparator byte = 0xff
|
var termSeparator byte = 0xff
|
||||||
var termSeparatorSplitSlice = []byte{termSeparator}
|
var termSeparatorSplitSlice = []byte{termSeparator}
|
||||||
|
|
||||||
|
@ -30,29 +38,36 @@ type chunkedContentCoder struct {
|
||||||
chunkSize uint64
|
chunkSize uint64
|
||||||
currChunk uint64
|
currChunk uint64
|
||||||
chunkLens []uint64
|
chunkLens []uint64
|
||||||
|
|
||||||
|
w io.Writer
|
||||||
|
progressiveWrite bool
|
||||||
|
|
||||||
chunkMetaBuf bytes.Buffer
|
chunkMetaBuf bytes.Buffer
|
||||||
chunkBuf bytes.Buffer
|
chunkBuf bytes.Buffer
|
||||||
|
|
||||||
chunkMeta []MetaData
|
chunkMeta []MetaData
|
||||||
|
|
||||||
|
compressed []byte // temp buf for snappy compression
|
||||||
}
|
}
|
||||||
|
|
||||||
// MetaData represents the data information inside a
|
// MetaData represents the data information inside a
|
||||||
// chunk.
|
// chunk.
|
||||||
type MetaData struct {
|
type MetaData struct {
|
||||||
DocNum uint64 // docNum of the data inside the chunk
|
DocNum uint64 // docNum of the data inside the chunk
|
||||||
DocDvLoc uint64 // starting offset for a given docid
|
DocDvOffset uint64 // offset of data inside the chunk for the given docid
|
||||||
DocDvLen uint64 // length of data inside the chunk for the given docid
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// newChunkedContentCoder returns a new chunk content coder which
|
// newChunkedContentCoder returns a new chunk content coder which
|
||||||
// packs data into chunks based on the provided chunkSize
|
// packs data into chunks based on the provided chunkSize
|
||||||
func newChunkedContentCoder(chunkSize uint64,
|
func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64,
|
||||||
maxDocNum uint64) *chunkedContentCoder {
|
w io.Writer, progressiveWrite bool) *chunkedContentCoder {
|
||||||
total := maxDocNum/chunkSize + 1
|
total := maxDocNum/chunkSize + 1
|
||||||
rv := &chunkedContentCoder{
|
rv := &chunkedContentCoder{
|
||||||
chunkSize: chunkSize,
|
chunkSize: chunkSize,
|
||||||
chunkLens: make([]uint64, total),
|
chunkLens: make([]uint64, total),
|
||||||
chunkMeta: make([]MetaData, 0, total),
|
chunkMeta: make([]MetaData, 0, total),
|
||||||
|
w: w,
|
||||||
|
progressiveWrite: progressiveWrite,
|
||||||
}
|
}
|
||||||
|
|
||||||
return rv
|
return rv
|
||||||
|
@ -88,7 +103,7 @@ func (c *chunkedContentCoder) flushContents() error {
|
||||||
|
|
||||||
// write out the metaData slice
|
// write out the metaData slice
|
||||||
for _, meta := range c.chunkMeta {
|
for _, meta := range c.chunkMeta {
|
||||||
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen)
|
_, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -98,10 +113,19 @@ func (c *chunkedContentCoder) flushContents() error {
|
||||||
metaData := c.chunkMetaBuf.Bytes()
|
metaData := c.chunkMetaBuf.Bytes()
|
||||||
c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
|
c.final = append(c.final, c.chunkMetaBuf.Bytes()...)
|
||||||
// write the compressed data to the final data
|
// write the compressed data to the final data
|
||||||
compressedData := snappy.Encode(nil, c.chunkBuf.Bytes())
|
c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes())
|
||||||
c.final = append(c.final, compressedData...)
|
c.final = append(c.final, c.compressed...)
|
||||||
|
|
||||||
|
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData))
|
||||||
|
|
||||||
|
if c.progressiveWrite {
|
||||||
|
_, err := c.w.Write(c.final)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
c.final = c.final[:0]
|
||||||
|
}
|
||||||
|
|
||||||
c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData))
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -122,7 +146,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
|
||||||
c.currChunk = chunk
|
c.currChunk = chunk
|
||||||
}
|
}
|
||||||
|
|
||||||
// mark the starting offset for this doc
|
// get the starting offset for this doc
|
||||||
dvOffset := c.chunkBuf.Len()
|
dvOffset := c.chunkBuf.Len()
|
||||||
dvSize, err := c.chunkBuf.Write(vals)
|
dvSize, err := c.chunkBuf.Write(vals)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -131,37 +155,76 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error {
|
||||||
|
|
||||||
c.chunkMeta = append(c.chunkMeta, MetaData{
|
c.chunkMeta = append(c.chunkMeta, MetaData{
|
||||||
DocNum: docNum,
|
DocNum: docNum,
|
||||||
DocDvLoc: uint64(dvOffset),
|
DocDvOffset: uint64(dvOffset + dvSize),
|
||||||
DocDvLen: uint64(dvSize),
|
|
||||||
})
|
})
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write commits all the encoded chunked contents to the provided writer.
|
// Write commits all the encoded chunked contents to the provided writer.
|
||||||
func (c *chunkedContentCoder) Write(w io.Writer) (int, error) {
|
//
|
||||||
|
// | ..... data ..... | chunk offsets (varints)
|
||||||
|
// | position of chunk offsets (uint64) | number of offsets (uint64) |
|
||||||
|
//
|
||||||
|
func (c *chunkedContentCoder) Write() (int, error) {
|
||||||
var tw int
|
var tw int
|
||||||
buf := make([]byte, binary.MaxVarintLen64)
|
|
||||||
|
if c.final != nil {
|
||||||
|
// write out the data section first
|
||||||
|
nw, err := c.w.Write(c.final)
|
||||||
|
tw += nw
|
||||||
|
if err != nil {
|
||||||
|
return tw, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
chunkOffsetsStart := uint64(tw)
|
||||||
|
|
||||||
|
if cap(c.final) < binary.MaxVarintLen64 {
|
||||||
|
c.final = make([]byte, binary.MaxVarintLen64)
|
||||||
|
} else {
|
||||||
|
c.final = c.final[0:binary.MaxVarintLen64]
|
||||||
|
}
|
||||||
|
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
|
||||||
|
// write out the chunk offsets
|
||||||
|
for _, chunkOffset := range chunkOffsets {
|
||||||
|
n := binary.PutUvarint(c.final, chunkOffset)
|
||||||
|
nw, err := c.w.Write(c.final[:n])
|
||||||
|
tw += nw
|
||||||
|
if err != nil {
|
||||||
|
return tw, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
chunkOffsetsLen := uint64(tw) - chunkOffsetsStart
|
||||||
|
|
||||||
|
c.final = c.final[0:8]
|
||||||
|
// write out the length of chunk offsets
|
||||||
|
binary.BigEndian.PutUint64(c.final, chunkOffsetsLen)
|
||||||
|
nw, err := c.w.Write(c.final)
|
||||||
|
tw += nw
|
||||||
|
if err != nil {
|
||||||
|
return tw, err
|
||||||
|
}
|
||||||
|
|
||||||
// write out the number of chunks
|
// write out the number of chunks
|
||||||
n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
|
binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens)))
|
||||||
nw, err := w.Write(buf[:n])
|
nw, err = c.w.Write(c.final)
|
||||||
tw += nw
|
|
||||||
if err != nil {
|
|
||||||
return tw, err
|
|
||||||
}
|
|
||||||
// write out the chunk lens
|
|
||||||
for _, chunkLen := range c.chunkLens {
|
|
||||||
n := binary.PutUvarint(buf, uint64(chunkLen))
|
|
||||||
nw, err = w.Write(buf[:n])
|
|
||||||
tw += nw
|
|
||||||
if err != nil {
|
|
||||||
return tw, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// write out the data
|
|
||||||
nw, err = w.Write(c.final)
|
|
||||||
tw += nw
|
tw += nw
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return tw, err
|
return tw, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
c.final = c.final[:0]
|
||||||
|
|
||||||
return tw, nil
|
return tw, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ReadDocValueBoundary elicits the start, end offsets from a
|
||||||
|
// metaData header slice
|
||||||
|
func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) {
|
||||||
|
var start uint64
|
||||||
|
if chunk > 0 {
|
||||||
|
start = metaHeaders[chunk-1].DocDvOffset
|
||||||
|
}
|
||||||
|
return start, metaHeaders[chunk].DocDvOffset
|
||||||
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@ package zap
|
||||||
import (
|
import (
|
||||||
"hash/crc32"
|
"hash/crc32"
|
||||||
"io"
|
"io"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
)
|
)
|
||||||
|
|
||||||
// CountHashWriter is a wrapper around a Writer which counts the number of
|
// CountHashWriter is a wrapper around a Writer which counts the number of
|
||||||
|
@ -25,6 +27,7 @@ type CountHashWriter struct {
|
||||||
w io.Writer
|
w io.Writer
|
||||||
crc uint32
|
crc uint32
|
||||||
n int
|
n int
|
||||||
|
s segment.StatsReporter
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
|
// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer
|
||||||
|
@ -32,11 +35,18 @@ func NewCountHashWriter(w io.Writer) *CountHashWriter {
|
||||||
return &CountHashWriter{w: w}
|
return &CountHashWriter{w: w}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter {
|
||||||
|
return &CountHashWriter{w: w, s: s}
|
||||||
|
}
|
||||||
|
|
||||||
// Write writes the provided bytes to the wrapped writer and counts the bytes
|
// Write writes the provided bytes to the wrapped writer and counts the bytes
|
||||||
func (c *CountHashWriter) Write(b []byte) (int, error) {
|
func (c *CountHashWriter) Write(b []byte) (int, error) {
|
||||||
n, err := c.w.Write(b)
|
n, err := c.w.Write(b)
|
||||||
c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n])
|
c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n])
|
||||||
c.n += n
|
c.n += n
|
||||||
|
if c.s != nil {
|
||||||
|
c.s.ReportBytesWritten(uint64(n))
|
||||||
|
}
|
||||||
return n, err
|
return n, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,13 +15,13 @@
|
||||||
package zap
|
package zap
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
"github.com/couchbase/vellum"
|
"github.com/couchbase/vellum"
|
||||||
"github.com/couchbase/vellum/regexp"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Dictionary is the zap representation of the term dictionary
|
// Dictionary is the zap representation of the term dictionary
|
||||||
|
@ -30,23 +30,36 @@ type Dictionary struct {
|
||||||
field string
|
field string
|
||||||
fieldID uint16
|
fieldID uint16
|
||||||
fst *vellum.FST
|
fst *vellum.FST
|
||||||
|
fstReader *vellum.Reader
|
||||||
}
|
}
|
||||||
|
|
||||||
// PostingsList returns the postings list for the specified term
|
// PostingsList returns the postings list for the specified term
|
||||||
func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
|
func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap,
|
||||||
return d.postingsList([]byte(term), except, nil)
|
prealloc segment.PostingsList) (segment.PostingsList, error) {
|
||||||
|
var preallocPL *PostingsList
|
||||||
|
pl, ok := prealloc.(*PostingsList)
|
||||||
|
if ok && pl != nil {
|
||||||
|
preallocPL = pl
|
||||||
|
}
|
||||||
|
return d.postingsList(term, except, preallocPL)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
|
func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) {
|
||||||
if d.fst == nil {
|
if d.fstReader == nil {
|
||||||
|
if rv == nil || rv == emptyPostingsList {
|
||||||
|
return emptyPostingsList, nil
|
||||||
|
}
|
||||||
return d.postingsListInit(rv, except), nil
|
return d.postingsListInit(rv, except), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
postingsOffset, exists, err := d.fst.Get(term)
|
postingsOffset, exists, err := d.fstReader.Get(term)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("vellum err: %v", err)
|
return nil, fmt.Errorf("vellum err: %v", err)
|
||||||
}
|
}
|
||||||
if !exists {
|
if !exists {
|
||||||
|
if rv == nil || rv == emptyPostingsList {
|
||||||
|
return emptyPostingsList, nil
|
||||||
|
}
|
||||||
return d.postingsListInit(rv, except), nil
|
return d.postingsListInit(rv, except), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -65,10 +78,17 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari
|
||||||
}
|
}
|
||||||
|
|
||||||
func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
|
func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList {
|
||||||
if rv == nil {
|
if rv == nil || rv == emptyPostingsList {
|
||||||
rv = &PostingsList{}
|
rv = &PostingsList{}
|
||||||
} else {
|
} else {
|
||||||
|
postings := rv.postings
|
||||||
|
if postings != nil {
|
||||||
|
postings.Clear()
|
||||||
|
}
|
||||||
|
|
||||||
*rv = PostingsList{} // clear the struct
|
*rv = PostingsList{} // clear the struct
|
||||||
|
|
||||||
|
rv.postings = postings
|
||||||
}
|
}
|
||||||
rv.sb = d.sb
|
rv.sb = d.sb
|
||||||
rv.except = except
|
rv.except = except
|
||||||
|
@ -85,6 +105,8 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator {
|
||||||
itr, err := d.fst.Iterator(nil, nil)
|
itr, err := d.fst.Iterator(nil, nil)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
rv.itr = itr
|
rv.itr = itr
|
||||||
|
} else if err != vellum.ErrIteratorDone {
|
||||||
|
rv.err = err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -98,13 +120,15 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator {
|
||||||
d: d,
|
d: d,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
kBeg := []byte(prefix)
|
||||||
|
kEnd := segment.IncrementBytes(kBeg)
|
||||||
|
|
||||||
if d.fst != nil {
|
if d.fst != nil {
|
||||||
r, err := regexp.New(prefix + ".*")
|
itr, err := d.fst.Iterator(kBeg, kEnd)
|
||||||
if err == nil {
|
|
||||||
itr, err := d.fst.Search(r, nil, nil)
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
rv.itr = itr
|
rv.itr = itr
|
||||||
}
|
} else if err != vellum.ErrIteratorDone {
|
||||||
|
rv.err = err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -130,36 +154,103 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator
|
||||||
itr, err := d.fst.Iterator([]byte(start), endBytes)
|
itr, err := d.fst.Iterator([]byte(start), endBytes)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
rv.itr = itr
|
rv.itr = itr
|
||||||
|
} else if err != vellum.ErrIteratorDone {
|
||||||
|
rv.err = err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AutomatonIterator returns an iterator which only visits terms
|
||||||
|
// having the the vellum automaton and start/end key range
|
||||||
|
func (d *Dictionary) AutomatonIterator(a vellum.Automaton,
|
||||||
|
startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator {
|
||||||
|
rv := &DictionaryIterator{
|
||||||
|
d: d,
|
||||||
|
}
|
||||||
|
|
||||||
|
if d.fst != nil {
|
||||||
|
itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive)
|
||||||
|
if err == nil {
|
||||||
|
rv.itr = itr
|
||||||
|
} else if err != vellum.ErrIteratorDone {
|
||||||
|
rv.err = err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (d *Dictionary) OnlyIterator(onlyTerms [][]byte,
|
||||||
|
includeCount bool) segment.DictionaryIterator {
|
||||||
|
|
||||||
|
rv := &DictionaryIterator{
|
||||||
|
d: d,
|
||||||
|
omitCount: !includeCount,
|
||||||
|
}
|
||||||
|
|
||||||
|
var buf bytes.Buffer
|
||||||
|
builder, err := vellum.New(&buf, nil)
|
||||||
|
if err != nil {
|
||||||
|
rv.err = err
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
for _, term := range onlyTerms {
|
||||||
|
err = builder.Insert(term, 0)
|
||||||
|
if err != nil {
|
||||||
|
rv.err = err
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
}
|
||||||
|
err = builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
rv.err = err
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
onlyFST, err := vellum.Load(buf.Bytes())
|
||||||
|
if err != nil {
|
||||||
|
rv.err = err
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
itr, err := d.fst.Search(onlyFST, nil, nil)
|
||||||
|
if err == nil {
|
||||||
|
rv.itr = itr
|
||||||
|
} else if err != vellum.ErrIteratorDone {
|
||||||
|
rv.err = err
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
// DictionaryIterator is an iterator for term dictionary
|
// DictionaryIterator is an iterator for term dictionary
|
||||||
type DictionaryIterator struct {
|
type DictionaryIterator struct {
|
||||||
d *Dictionary
|
d *Dictionary
|
||||||
itr vellum.Iterator
|
itr vellum.Iterator
|
||||||
err error
|
err error
|
||||||
tmp PostingsList
|
tmp PostingsList
|
||||||
|
entry index.DictEntry
|
||||||
|
omitCount bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// Next returns the next entry in the dictionary
|
// Next returns the next entry in the dictionary
|
||||||
func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
|
func (i *DictionaryIterator) Next() (*index.DictEntry, error) {
|
||||||
if i.itr == nil || i.err == vellum.ErrIteratorDone {
|
if i.err != nil && i.err != vellum.ErrIteratorDone {
|
||||||
return nil, nil
|
|
||||||
} else if i.err != nil {
|
|
||||||
return nil, i.err
|
return nil, i.err
|
||||||
|
} else if i.itr == nil || i.err == vellum.ErrIteratorDone {
|
||||||
|
return nil, nil
|
||||||
}
|
}
|
||||||
term, postingsOffset := i.itr.Current()
|
term, postingsOffset := i.itr.Current()
|
||||||
|
i.entry.Term = string(term)
|
||||||
|
if !i.omitCount {
|
||||||
i.err = i.tmp.read(postingsOffset, i.d)
|
i.err = i.tmp.read(postingsOffset, i.d)
|
||||||
if i.err != nil {
|
if i.err != nil {
|
||||||
return nil, i.err
|
return nil, i.err
|
||||||
}
|
}
|
||||||
rv := &index.DictEntry{
|
i.entry.Count = i.tmp.Count()
|
||||||
Term: string(term),
|
|
||||||
Count: i.tmp.Count(),
|
|
||||||
}
|
}
|
||||||
i.err = i.itr.Next()
|
i.err = i.itr.Next()
|
||||||
return rv, nil
|
return &i.entry, nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,93 +19,129 @@ import (
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"reflect"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
"github.com/golang/snappy"
|
"github.com/golang/snappy"
|
||||||
)
|
)
|
||||||
|
|
||||||
type docValueIterator struct {
|
var reflectStaticSizedocValueReader int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var dvi docValueReader
|
||||||
|
reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size())
|
||||||
|
}
|
||||||
|
|
||||||
|
type docNumTermsVisitor func(docNum uint64, terms []byte) error
|
||||||
|
|
||||||
|
type docVisitState struct {
|
||||||
|
dvrs map[uint16]*docValueReader
|
||||||
|
segment *Segment
|
||||||
|
}
|
||||||
|
|
||||||
|
type docValueReader struct {
|
||||||
field string
|
field string
|
||||||
curChunkNum uint64
|
curChunkNum uint64
|
||||||
numChunks uint64
|
chunkOffsets []uint64
|
||||||
chunkLens []uint64
|
|
||||||
dvDataLoc uint64
|
dvDataLoc uint64
|
||||||
curChunkHeader []MetaData
|
curChunkHeader []MetaData
|
||||||
curChunkData []byte // compressed data cache
|
curChunkData []byte // compressed data cache
|
||||||
|
uncompressed []byte // temp buf for snappy decompression
|
||||||
}
|
}
|
||||||
|
|
||||||
func (di *docValueIterator) sizeInBytes() uint64 {
|
func (di *docValueReader) size() int {
|
||||||
// curChunkNum, numChunks, dvDataLoc --> uint64
|
return reflectStaticSizedocValueReader + size.SizeOfPtr +
|
||||||
sizeInBytes := 24
|
len(di.field) +
|
||||||
|
len(di.chunkOffsets)*size.SizeOfUint64 +
|
||||||
// field
|
len(di.curChunkHeader)*reflectStaticSizeMetaData +
|
||||||
sizeInBytes += (len(di.field) + int(segment.SizeOfString))
|
len(di.curChunkData)
|
||||||
|
|
||||||
// chunkLens, curChunkHeader
|
|
||||||
sizeInBytes += len(di.chunkLens)*8 +
|
|
||||||
len(di.curChunkHeader)*24 +
|
|
||||||
int(segment.SizeOfSlice*2) /* overhead from slices */
|
|
||||||
|
|
||||||
// curChunkData is mmap'ed, not included
|
|
||||||
|
|
||||||
return uint64(sizeInBytes)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (di *docValueIterator) fieldName() string {
|
func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader {
|
||||||
|
if rv == nil {
|
||||||
|
rv = &docValueReader{}
|
||||||
|
}
|
||||||
|
|
||||||
|
rv.field = di.field
|
||||||
|
rv.curChunkNum = math.MaxUint64
|
||||||
|
rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable
|
||||||
|
rv.dvDataLoc = di.dvDataLoc
|
||||||
|
rv.curChunkHeader = rv.curChunkHeader[:0]
|
||||||
|
rv.curChunkData = nil
|
||||||
|
rv.uncompressed = rv.uncompressed[:0]
|
||||||
|
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (di *docValueReader) fieldName() string {
|
||||||
return di.field
|
return di.field
|
||||||
}
|
}
|
||||||
|
|
||||||
func (di *docValueIterator) curChunkNumber() uint64 {
|
func (di *docValueReader) curChunkNumber() uint64 {
|
||||||
return di.curChunkNum
|
return di.curChunkNum
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SegmentBase) loadFieldDocValueIterator(field string,
|
func (s *SegmentBase) loadFieldDocValueReader(field string,
|
||||||
fieldDvLoc uint64) (*docValueIterator, error) {
|
fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) {
|
||||||
// get the docValue offset for the given fields
|
// get the docValue offset for the given fields
|
||||||
if fieldDvLoc == fieldNotUninverted {
|
if fieldDvLocStart == fieldNotUninverted {
|
||||||
return nil, fmt.Errorf("loadFieldDocValueIterator: "+
|
return nil, fmt.Errorf("loadFieldDocValueReader: "+
|
||||||
"no docValues found for field: %s", field)
|
"no docValues found for field: %s", field)
|
||||||
}
|
}
|
||||||
|
|
||||||
// read the number of chunks, chunk lengths
|
// read the number of chunks, and chunk offsets position
|
||||||
var offset, clen uint64
|
var numChunks, chunkOffsetsPosition uint64
|
||||||
numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64])
|
|
||||||
if read <= 0 {
|
|
||||||
return nil, fmt.Errorf("failed to read the field "+
|
|
||||||
"doc values for field %s", field)
|
|
||||||
}
|
|
||||||
offset += uint64(read)
|
|
||||||
|
|
||||||
fdvIter := &docValueIterator{
|
if fieldDvLocEnd-fieldDvLocStart > 16 {
|
||||||
|
numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd])
|
||||||
|
// read the length of chunk offsets
|
||||||
|
chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8])
|
||||||
|
// acquire position of chunk offsets
|
||||||
|
chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen
|
||||||
|
}
|
||||||
|
|
||||||
|
fdvIter := &docValueReader{
|
||||||
curChunkNum: math.MaxUint64,
|
curChunkNum: math.MaxUint64,
|
||||||
field: field,
|
field: field,
|
||||||
chunkLens: make([]uint64, int(numChunks)),
|
chunkOffsets: make([]uint64, int(numChunks)),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// read the chunk offsets
|
||||||
|
var offset uint64
|
||||||
for i := 0; i < int(numChunks); i++ {
|
for i := 0; i < int(numChunks); i++ {
|
||||||
clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64])
|
loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64])
|
||||||
if read <= 0 {
|
if read <= 0 {
|
||||||
return nil, fmt.Errorf("corrupted chunk length during segment load")
|
return nil, fmt.Errorf("corrupted chunk offset during segment load")
|
||||||
}
|
}
|
||||||
fdvIter.chunkLens[i] = clen
|
fdvIter.chunkOffsets[i] = loc
|
||||||
offset += uint64(read)
|
offset += uint64(read)
|
||||||
}
|
}
|
||||||
|
|
||||||
fdvIter.dvDataLoc = fieldDvLoc + offset
|
// set the data offset
|
||||||
|
fdvIter.dvDataLoc = fieldDvLocStart
|
||||||
|
|
||||||
return fdvIter, nil
|
return fdvIter, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (di *docValueIterator) loadDvChunk(chunkNumber,
|
func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error {
|
||||||
localDocNum uint64, s *SegmentBase) error {
|
|
||||||
// advance to the chunk where the docValues
|
// advance to the chunk where the docValues
|
||||||
// reside for the given docNum
|
// reside for the given docNum
|
||||||
destChunkDataLoc := di.dvDataLoc
|
destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc
|
||||||
for i := 0; i < int(chunkNumber); i++ {
|
start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets)
|
||||||
destChunkDataLoc += di.chunkLens[i]
|
if start >= end {
|
||||||
|
di.curChunkHeader = di.curChunkHeader[:0]
|
||||||
|
di.curChunkData = nil
|
||||||
|
di.curChunkNum = chunkNumber
|
||||||
|
di.uncompressed = di.uncompressed[:0]
|
||||||
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
curChunkSize := di.chunkLens[chunkNumber]
|
destChunkDataLoc += start
|
||||||
|
curChunkEnd += end
|
||||||
|
|
||||||
// read the number of docs reside in the chunk
|
// read the number of docs reside in the chunk
|
||||||
numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
|
numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64])
|
||||||
if read <= 0 {
|
if read <= 0 {
|
||||||
|
@ -114,38 +150,81 @@ func (di *docValueIterator) loadDvChunk(chunkNumber,
|
||||||
chunkMetaLoc := destChunkDataLoc + uint64(read)
|
chunkMetaLoc := destChunkDataLoc + uint64(read)
|
||||||
|
|
||||||
offset := uint64(0)
|
offset := uint64(0)
|
||||||
|
if cap(di.curChunkHeader) < int(numDocs) {
|
||||||
di.curChunkHeader = make([]MetaData, int(numDocs))
|
di.curChunkHeader = make([]MetaData, int(numDocs))
|
||||||
|
} else {
|
||||||
|
di.curChunkHeader = di.curChunkHeader[:int(numDocs)]
|
||||||
|
}
|
||||||
for i := 0; i < int(numDocs); i++ {
|
for i := 0; i < int(numDocs); i++ {
|
||||||
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||||
offset += uint64(read)
|
offset += uint64(read)
|
||||||
di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
||||||
offset += uint64(read)
|
|
||||||
di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64])
|
|
||||||
offset += uint64(read)
|
offset += uint64(read)
|
||||||
}
|
}
|
||||||
|
|
||||||
compressedDataLoc := chunkMetaLoc + offset
|
compressedDataLoc := chunkMetaLoc + offset
|
||||||
dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc
|
dataLength := curChunkEnd - compressedDataLoc
|
||||||
di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
|
di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength]
|
||||||
di.curChunkNum = chunkNumber
|
di.curChunkNum = chunkNumber
|
||||||
|
di.uncompressed = di.uncompressed[:0]
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (di *docValueIterator) visitDocValues(docNum uint64,
|
func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error {
|
||||||
visitor index.DocumentFieldTermVisitor) error {
|
for i := 0; i < len(di.chunkOffsets); i++ {
|
||||||
// binary search the term locations for the docNum
|
err := di.loadDvChunk(uint64(i), s)
|
||||||
start, length := di.getDocValueLocs(docNum)
|
if err != nil {
|
||||||
if start == math.MaxUint64 || length == math.MaxUint64 {
|
return err
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
if di.curChunkData == nil || len(di.curChunkHeader) == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
// uncompress the already loaded data
|
// uncompress the already loaded data
|
||||||
uncompressed, err := snappy.Decode(nil, di.curChunkData)
|
uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
di.uncompressed = uncompressed
|
||||||
|
|
||||||
|
start := uint64(0)
|
||||||
|
for _, entry := range di.curChunkHeader {
|
||||||
|
err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
start = entry.DocDvOffset
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (di *docValueReader) visitDocValues(docNum uint64,
|
||||||
|
visitor index.DocumentFieldTermVisitor) error {
|
||||||
|
// binary search the term locations for the docNum
|
||||||
|
start, end := di.getDocValueLocs(docNum)
|
||||||
|
if start == math.MaxUint64 || end == math.MaxUint64 || start == end {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var uncompressed []byte
|
||||||
|
var err error
|
||||||
|
// use the uncompressed copy if available
|
||||||
|
if len(di.uncompressed) > 0 {
|
||||||
|
uncompressed = di.uncompressed
|
||||||
|
} else {
|
||||||
|
// uncompress the already loaded data
|
||||||
|
uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
di.uncompressed = uncompressed
|
||||||
|
}
|
||||||
|
|
||||||
// pick the terms for the given docNum
|
// pick the terms for the given docNum
|
||||||
uncompressed = uncompressed[start : start+length]
|
uncompressed = uncompressed[start:end]
|
||||||
for {
|
for {
|
||||||
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
|
i := bytes.Index(uncompressed, termSeparatorSplitSlice)
|
||||||
if i < 0 {
|
if i < 0 {
|
||||||
|
@ -159,55 +238,72 @@ func (di *docValueIterator) visitDocValues(docNum uint64,
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) {
|
func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) {
|
||||||
i := sort.Search(len(di.curChunkHeader), func(i int) bool {
|
i := sort.Search(len(di.curChunkHeader), func(i int) bool {
|
||||||
return di.curChunkHeader[i].DocNum >= docNum
|
return di.curChunkHeader[i].DocNum >= docNum
|
||||||
})
|
})
|
||||||
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
|
if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum {
|
||||||
return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen
|
return ReadDocValueBoundary(i, di.curChunkHeader)
|
||||||
}
|
}
|
||||||
return math.MaxUint64, math.MaxUint64
|
return math.MaxUint64, math.MaxUint64
|
||||||
}
|
}
|
||||||
|
|
||||||
// VisitDocumentFieldTerms is an implementation of the
|
// VisitDocumentFieldTerms is an implementation of the
|
||||||
// DocumentFieldTermVisitable interface
|
// DocumentFieldTermVisitable interface
|
||||||
func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
||||||
visitor index.DocumentFieldTermVisitor) error {
|
visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) (
|
||||||
fieldIDPlus1 := uint16(0)
|
segment.DocVisitState, error) {
|
||||||
ok := true
|
dvs, ok := dvsIn.(*docVisitState)
|
||||||
|
if !ok || dvs == nil {
|
||||||
|
dvs = &docVisitState{}
|
||||||
|
} else {
|
||||||
|
if dvs.segment != s {
|
||||||
|
dvs.segment = s
|
||||||
|
dvs.dvrs = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var fieldIDPlus1 uint16
|
||||||
|
if dvs.dvrs == nil {
|
||||||
|
dvs.dvrs = make(map[uint16]*docValueReader, len(fields))
|
||||||
for _, field := range fields {
|
for _, field := range fields {
|
||||||
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
|
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
fieldID := fieldIDPlus1 - 1
|
||||||
|
if dvIter, exists := s.fieldDvReaders[fieldID]; exists &&
|
||||||
|
dvIter != nil {
|
||||||
|
dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// find the chunkNumber where the docValues are stored
|
// find the chunkNumber where the docValues are stored
|
||||||
docInChunk := localDocNum / uint64(s.chunkFactor)
|
docInChunk := localDocNum / uint64(s.chunkFactor)
|
||||||
|
var dvr *docValueReader
|
||||||
if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists &&
|
for _, field := range fields {
|
||||||
dvIter != nil {
|
if fieldIDPlus1, ok = s.fieldsMap[field]; !ok {
|
||||||
// check if the chunk is already loaded
|
|
||||||
if docInChunk != dvIter.curChunkNumber() {
|
|
||||||
err := dvIter.loadDvChunk(docInChunk, localDocNum, s)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
fieldID := fieldIDPlus1 - 1
|
||||||
|
if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil {
|
||||||
|
// check if the chunk is already loaded
|
||||||
|
if docInChunk != dvr.curChunkNumber() {
|
||||||
|
err := dvr.loadDvChunk(docInChunk, &s.SegmentBase)
|
||||||
|
if err != nil {
|
||||||
|
return dvs, err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
_ = dvIter.visitDocValues(localDocNum, visitor)
|
_ = dvr.visitDocValues(localDocNum, visitor)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return dvs, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// VisitableDocValueFields returns the list of fields with
|
// VisitableDocValueFields returns the list of fields with
|
||||||
// persisted doc value terms ready to be visitable using the
|
// persisted doc value terms ready to be visitable using the
|
||||||
// VisitDocumentFieldTerms method.
|
// VisitDocumentFieldTerms method.
|
||||||
func (s *Segment) VisitableDocValueFields() ([]string, error) {
|
func (s *Segment) VisitableDocValueFields() ([]string, error) {
|
||||||
var rv []string
|
return s.fieldDvNames, nil
|
||||||
for fieldID, field := range s.fieldsInv {
|
|
||||||
if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok &&
|
|
||||||
dvIter != nil {
|
|
||||||
rv = append(rv, field)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv, nil
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,26 +46,27 @@ func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) {
|
||||||
for i, itr := range rv.itrs {
|
for i, itr := range rv.itrs {
|
||||||
rv.currKs[i], rv.currVs[i] = itr.Current()
|
rv.currKs[i], rv.currVs[i] = itr.Current()
|
||||||
}
|
}
|
||||||
rv.updateMatches()
|
rv.updateMatches(false)
|
||||||
if rv.lowK == nil {
|
if rv.lowK == nil && len(rv.lowIdxs) == 0 {
|
||||||
return rv, vellum.ErrIteratorDone
|
return rv, vellum.ErrIteratorDone
|
||||||
}
|
}
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// updateMatches maintains the low key matches based on the currKs
|
// updateMatches maintains the low key matches based on the currKs
|
||||||
func (m *enumerator) updateMatches() {
|
func (m *enumerator) updateMatches(skipEmptyKey bool) {
|
||||||
m.lowK = nil
|
m.lowK = nil
|
||||||
m.lowIdxs = m.lowIdxs[:0]
|
m.lowIdxs = m.lowIdxs[:0]
|
||||||
m.lowCurr = 0
|
m.lowCurr = 0
|
||||||
|
|
||||||
for i, key := range m.currKs {
|
for i, key := range m.currKs {
|
||||||
if key == nil {
|
if (key == nil && m.currVs[i] == 0) || // in case of empty iterator
|
||||||
|
(len(key) == 0 && skipEmptyKey) { // skip empty keys
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
cmp := bytes.Compare(key, m.lowK)
|
cmp := bytes.Compare(key, m.lowK)
|
||||||
if cmp < 0 || m.lowK == nil {
|
if cmp < 0 || len(m.lowIdxs) == 0 {
|
||||||
// reached a new low
|
// reached a new low
|
||||||
m.lowK = key
|
m.lowK = key
|
||||||
m.lowIdxs = m.lowIdxs[:0]
|
m.lowIdxs = m.lowIdxs[:0]
|
||||||
|
@ -102,9 +103,10 @@ func (m *enumerator) Next() error {
|
||||||
}
|
}
|
||||||
m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current()
|
m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current()
|
||||||
}
|
}
|
||||||
m.updateMatches()
|
// can skip any empty keys encountered at this point
|
||||||
|
m.updateMatches(true)
|
||||||
}
|
}
|
||||||
if m.lowK == nil {
|
if m.lowK == nil && len(m.lowIdxs) == 0 {
|
||||||
return vellum.ErrIteratorDone
|
return vellum.ErrIteratorDone
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
|
|
@ -18,16 +18,12 @@ import (
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"io"
|
"io"
|
||||||
|
|
||||||
"github.com/Smerity/govarint"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type chunkedIntCoder struct {
|
type chunkedIntCoder struct {
|
||||||
final []byte
|
final []byte
|
||||||
maxDocNum uint64
|
|
||||||
chunkSize uint64
|
chunkSize uint64
|
||||||
chunkBuf bytes.Buffer
|
chunkBuf bytes.Buffer
|
||||||
encoder *govarint.Base128Encoder
|
|
||||||
chunkLens []uint64
|
chunkLens []uint64
|
||||||
currChunk uint64
|
currChunk uint64
|
||||||
|
|
||||||
|
@ -41,11 +37,9 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder {
|
||||||
total := maxDocNum/chunkSize + 1
|
total := maxDocNum/chunkSize + 1
|
||||||
rv := &chunkedIntCoder{
|
rv := &chunkedIntCoder{
|
||||||
chunkSize: chunkSize,
|
chunkSize: chunkSize,
|
||||||
maxDocNum: maxDocNum,
|
|
||||||
chunkLens: make([]uint64, total),
|
chunkLens: make([]uint64, total),
|
||||||
final: make([]byte, 0, 64),
|
final: make([]byte, 0, 64),
|
||||||
}
|
}
|
||||||
rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf)
|
|
||||||
|
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
@ -67,16 +61,18 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
|
||||||
chunk := docNum / c.chunkSize
|
chunk := docNum / c.chunkSize
|
||||||
if chunk != c.currChunk {
|
if chunk != c.currChunk {
|
||||||
// starting a new chunk
|
// starting a new chunk
|
||||||
if c.encoder != nil {
|
|
||||||
// close out last
|
|
||||||
c.Close()
|
c.Close()
|
||||||
c.chunkBuf.Reset()
|
c.chunkBuf.Reset()
|
||||||
}
|
|
||||||
c.currChunk = chunk
|
c.currChunk = chunk
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(c.buf) < binary.MaxVarintLen64 {
|
||||||
|
c.buf = make([]byte, binary.MaxVarintLen64)
|
||||||
|
}
|
||||||
|
|
||||||
for _, val := range vals {
|
for _, val := range vals {
|
||||||
_, err := c.encoder.PutU64(val)
|
wb := binary.PutUvarint(c.buf, val)
|
||||||
|
_, err := c.chunkBuf.Write(c.buf[:wb])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -85,13 +81,26 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error {
|
||||||
|
chunk := docNum / c.chunkSize
|
||||||
|
if chunk != c.currChunk {
|
||||||
|
// starting a new chunk
|
||||||
|
c.Close()
|
||||||
|
c.chunkBuf.Reset()
|
||||||
|
c.currChunk = chunk
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err := c.chunkBuf.Write(buf)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// Close indicates you are done calling Add(); this allows the final chunk
|
// Close indicates you are done calling Add(); this allows the final chunk
|
||||||
// to be encoded.
|
// to be encoded.
|
||||||
func (c *chunkedIntCoder) Close() {
|
func (c *chunkedIntCoder) Close() {
|
||||||
c.encoder.Close()
|
|
||||||
encodingBytes := c.chunkBuf.Bytes()
|
encodingBytes := c.chunkBuf.Bytes()
|
||||||
c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
|
c.chunkLens[c.currChunk] = uint64(len(encodingBytes))
|
||||||
c.final = append(c.final, encodingBytes...)
|
c.final = append(c.final, encodingBytes...)
|
||||||
|
c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write commits all the encoded chunked integers to the provided writer.
|
// Write commits all the encoded chunked integers to the provided writer.
|
||||||
|
@ -102,10 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
|
||||||
}
|
}
|
||||||
buf := c.buf
|
buf := c.buf
|
||||||
|
|
||||||
// write out the number of chunks & each chunkLen
|
// convert the chunk lengths into chunk offsets
|
||||||
n := binary.PutUvarint(buf, uint64(len(c.chunkLens)))
|
chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens)
|
||||||
for _, chunkLen := range c.chunkLens {
|
|
||||||
n += binary.PutUvarint(buf[n:], uint64(chunkLen))
|
// write out the number of chunks & each chunk offset
|
||||||
|
n := binary.PutUvarint(buf, uint64(len(chunkOffsets)))
|
||||||
|
for _, chunkOffset := range chunkOffsets {
|
||||||
|
n += binary.PutUvarint(buf[n:], chunkOffset)
|
||||||
}
|
}
|
||||||
|
|
||||||
tw, err := w.Write(buf[:n])
|
tw, err := w.Write(buf[:n])
|
||||||
|
@ -121,3 +133,40 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) {
|
||||||
}
|
}
|
||||||
return tw, nil
|
return tw, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *chunkedIntCoder) FinalSize() int {
|
||||||
|
return len(c.final)
|
||||||
|
}
|
||||||
|
|
||||||
|
// modifyLengthsToEndOffsets converts the chunk length array
|
||||||
|
// into an array of cumulative end offsets, in place.
|
||||||
|
// readChunkBoundary derives the start and end of every chunk
|
||||||
|
// from these offsets: the end offset of chunk i is stored at
|
||||||
|
// index i, and its start offset is the value stored at index
|
||||||
|
// i-1 (the end of the previous chunk).
|
||||||
|
// For chunk 0, the start offset is always zero.
|
||||||
|
// eg:
|
||||||
|
// Lens -> 5 5 5 5 => 5 10 15 20
|
||||||
|
// Lens -> 0 5 0 5 => 0 5 5 10
|
||||||
|
// Lens -> 0 0 0 5 => 0 0 0 5
|
||||||
|
// Lens -> 5 0 0 0 => 5 5 5 5
|
||||||
|
// Lens -> 0 5 0 0 => 0 5 5 5
|
||||||
|
// Lens -> 0 0 5 0 => 0 0 5 5
|
||||||
|
func modifyLengthsToEndOffsets(lengths []uint64) []uint64 {
|
||||||
|
var runningOffset uint64
|
||||||
|
var index, i int
|
||||||
|
for i = 1; i <= len(lengths); i++ {
|
||||||
|
runningOffset += lengths[i-1]
|
||||||
|
lengths[index] = runningOffset
|
||||||
|
index++
|
||||||
|
}
|
||||||
|
return lengths
|
||||||
|
}
|
||||||
|
|
||||||
|
func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) {
|
||||||
|
var start uint64
|
||||||
|
if chunk > 0 {
|
||||||
|
start = offsets[chunk-1]
|
||||||
|
}
|
||||||
|
return start, offsets[chunk]
|
||||||
|
}
|
||||||
|
|
|
@ -24,11 +24,13 @@ import (
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
"github.com/Smerity/govarint"
|
seg "github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
"github.com/couchbase/vellum"
|
"github.com/couchbase/vellum"
|
||||||
"github.com/golang/snappy"
|
"github.com/golang/snappy"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var DefaultFileMergerBufferSize = 1024 * 1024
|
||||||
|
|
||||||
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
|
const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
|
||||||
|
|
||||||
// Merge takes a slice of zap segments and bit masks describing which
|
// Merge takes a slice of zap segments and bit masks describing which
|
||||||
|
@ -36,12 +38,24 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc
|
||||||
// remaining data. This new segment is built at the specified path,
|
// remaining data. This new segment is built at the specified path,
|
||||||
// with the provided chunkFactor.
|
// with the provided chunkFactor.
|
||||||
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
|
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
|
||||||
chunkFactor uint32) ([][]uint64, error) {
|
chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) (
|
||||||
|
[][]uint64, uint64, error) {
|
||||||
|
segmentBases := make([]*SegmentBase, len(segments))
|
||||||
|
for segmenti, segment := range segments {
|
||||||
|
segmentBases[segmenti] = &segment.SegmentBase
|
||||||
|
}
|
||||||
|
|
||||||
|
return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh, s)
|
||||||
|
}
|
||||||
|
|
||||||
|
func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string,
|
||||||
|
chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) (
|
||||||
|
[][]uint64, uint64, error) {
|
||||||
flag := os.O_RDWR | os.O_CREATE
|
flag := os.O_RDWR | os.O_CREATE
|
||||||
|
|
||||||
f, err := os.OpenFile(path, flag, 0600)
|
f, err := os.OpenFile(path, flag, 0600)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
cleanup := func() {
|
cleanup := func() {
|
||||||
|
@ -49,54 +63,49 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string,
|
||||||
_ = os.Remove(path)
|
_ = os.Remove(path)
|
||||||
}
|
}
|
||||||
|
|
||||||
segmentBases := make([]*SegmentBase, len(segments))
|
|
||||||
for segmenti, segment := range segments {
|
|
||||||
segmentBases[segmenti] = &segment.SegmentBase
|
|
||||||
}
|
|
||||||
|
|
||||||
// buffer the output
|
// buffer the output
|
||||||
br := bufio.NewWriter(f)
|
br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize)
|
||||||
|
|
||||||
// wrap it for counting (tracking offsets)
|
// wrap it for counting (tracking offsets)
|
||||||
cr := NewCountHashWriter(br)
|
cr := NewCountHashWriterWithStatsReporter(br, s)
|
||||||
|
|
||||||
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err :=
|
newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err :=
|
||||||
MergeToWriter(segmentBases, drops, chunkFactor, cr)
|
MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cleanup()
|
cleanup()
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset,
|
err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset,
|
||||||
docValueOffset, chunkFactor, cr.Sum32(), cr)
|
docValueOffset, chunkFactor, cr.Sum32(), cr)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cleanup()
|
cleanup()
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = br.Flush()
|
err = br.Flush()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cleanup()
|
cleanup()
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = f.Sync()
|
err = f.Sync()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cleanup()
|
cleanup()
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = f.Close()
|
err = f.Close()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cleanup()
|
cleanup()
|
||||||
return nil, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return newDocNums, nil
|
return newDocNums, uint64(cr.Count()), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
|
func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
chunkFactor uint32, cr *CountHashWriter) (
|
chunkFactor uint32, cr *CountHashWriter, closeCh chan struct{}) (
|
||||||
newDocNums [][]uint64,
|
newDocNums [][]uint64,
|
||||||
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
|
numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64,
|
||||||
dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16,
|
dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16,
|
||||||
|
@ -108,15 +117,21 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
fieldsMap = mapFields(fieldsInv)
|
fieldsMap = mapFields(fieldsInv)
|
||||||
|
|
||||||
numDocs = computeNewDocCount(segments, drops)
|
numDocs = computeNewDocCount(segments, drops)
|
||||||
|
|
||||||
|
if isClosed(closeCh) {
|
||||||
|
return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed
|
||||||
|
}
|
||||||
|
|
||||||
if numDocs > 0 {
|
if numDocs > 0 {
|
||||||
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
|
storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops,
|
||||||
fieldsMap, fieldsInv, fieldsSame, numDocs, cr)
|
fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap,
|
dictLocs, docValueOffset, err = persistMergedRest(segments, drops,
|
||||||
newDocNums, numDocs, chunkFactor, cr)
|
fieldsInv, fieldsMap, fieldsSame,
|
||||||
|
newDocNums, numDocs, chunkFactor, cr, closeCh)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
return nil, 0, 0, 0, 0, nil, nil, nil, err
|
||||||
}
|
}
|
||||||
|
@ -156,11 +171,10 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64,
|
fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool,
|
||||||
newSegDocCount uint64, chunkFactor uint32,
|
newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32,
|
||||||
w *CountHashWriter) ([]uint64, uint64, error) {
|
w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) {
|
||||||
|
|
||||||
var bufReuse bytes.Buffer
|
|
||||||
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
|
var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64)
|
||||||
var bufLoc []uint64
|
var bufLoc []uint64
|
||||||
|
|
||||||
|
@ -168,36 +182,38 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
var postItr *PostingsIterator
|
var postItr *PostingsIterator
|
||||||
|
|
||||||
rv := make([]uint64, len(fieldsInv))
|
rv := make([]uint64, len(fieldsInv))
|
||||||
fieldDvLocs := make([]uint64, len(fieldsInv))
|
fieldDvLocsStart := make([]uint64, len(fieldsInv))
|
||||||
|
fieldDvLocsEnd := make([]uint64, len(fieldsInv))
|
||||||
|
|
||||||
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||||
locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1)
|
||||||
|
|
||||||
// docTermMap is keyed by docNum, where the array impl provides
|
|
||||||
// better memory usage behavior than a sparse-friendlier hashmap
|
|
||||||
// for when docs have much structural similarity (i.e., every doc
|
|
||||||
// has a given field)
|
|
||||||
var docTermMap [][]byte
|
|
||||||
|
|
||||||
var vellumBuf bytes.Buffer
|
var vellumBuf bytes.Buffer
|
||||||
|
|
||||||
// for each field
|
|
||||||
for fieldID, fieldName := range fieldsInv {
|
|
||||||
if fieldID != 0 {
|
|
||||||
vellumBuf.Reset()
|
|
||||||
}
|
|
||||||
newVellum, err := vellum.New(&vellumBuf, nil)
|
newVellum, err := vellum.New(&vellumBuf, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
newRoaring := roaring.NewBitmap()
|
||||||
|
|
||||||
|
// for each field
|
||||||
|
for fieldID, fieldName := range fieldsInv {
|
||||||
|
|
||||||
// collect FST iterators from all active segments for this field
|
// collect FST iterators from all active segments for this field
|
||||||
var newDocNums [][]uint64
|
var newDocNums [][]uint64
|
||||||
var drops []*roaring.Bitmap
|
var drops []*roaring.Bitmap
|
||||||
var dicts []*Dictionary
|
var dicts []*Dictionary
|
||||||
var itrs []vellum.Iterator
|
var itrs []vellum.Iterator
|
||||||
|
|
||||||
|
var segmentsInFocus []*SegmentBase
|
||||||
|
|
||||||
for segmentI, segment := range segments {
|
for segmentI, segment := range segments {
|
||||||
|
|
||||||
|
// check whether closeCh has been closed in the meantime
|
||||||
|
if isClosed(closeCh) {
|
||||||
|
return nil, 0, seg.ErrClosed
|
||||||
|
}
|
||||||
|
|
||||||
dict, err2 := segment.dictionary(fieldName)
|
dict, err2 := segment.dictionary(fieldName)
|
||||||
if err2 != nil {
|
if err2 != nil {
|
||||||
return nil, 0, err2
|
return nil, 0, err2
|
||||||
|
@ -209,89 +225,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
}
|
}
|
||||||
if itr != nil {
|
if itr != nil {
|
||||||
newDocNums = append(newDocNums, newDocNumsIn[segmentI])
|
newDocNums = append(newDocNums, newDocNumsIn[segmentI])
|
||||||
|
if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() {
|
||||||
drops = append(drops, dropsIn[segmentI])
|
drops = append(drops, dropsIn[segmentI])
|
||||||
|
} else {
|
||||||
|
drops = append(drops, nil)
|
||||||
|
}
|
||||||
dicts = append(dicts, dict)
|
dicts = append(dicts, dict)
|
||||||
itrs = append(itrs, itr)
|
itrs = append(itrs, itr)
|
||||||
|
segmentsInFocus = append(segmentsInFocus, segment)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if uint64(cap(docTermMap)) < newSegDocCount {
|
|
||||||
docTermMap = make([][]byte, newSegDocCount)
|
|
||||||
} else {
|
|
||||||
docTermMap = docTermMap[0:newSegDocCount]
|
|
||||||
for docNum := range docTermMap { // reset the docTermMap
|
|
||||||
docTermMap[docNum] = docTermMap[docNum][:0]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
var prevTerm []byte
|
var prevTerm []byte
|
||||||
|
|
||||||
newRoaring := roaring.NewBitmap()
|
newRoaring.Clear()
|
||||||
newRoaringLocs := roaring.NewBitmap()
|
|
||||||
|
|
||||||
finishTerm := func(term []byte) error {
|
var lastDocNum, lastFreq, lastNorm uint64
|
||||||
if term == nil {
|
|
||||||
return nil
|
// determines whether to use "1-hit" encoding optimization
|
||||||
|
// when a term appears in only 1 doc, with no loc info,
|
||||||
|
// has freq of 1, and the docNum fits into 31-bits
|
||||||
|
use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) {
|
||||||
|
if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 {
|
||||||
|
docNum := uint64(newRoaring.Minimum())
|
||||||
|
if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 {
|
||||||
|
return true, docNum, lastNorm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false, 0, 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
finishTerm := func(term []byte) error {
|
||||||
tfEncoder.Close()
|
tfEncoder.Close()
|
||||||
locEncoder.Close()
|
locEncoder.Close()
|
||||||
|
|
||||||
if newRoaring.GetCardinality() > 0 {
|
postingsOffset, err := writePostings(newRoaring,
|
||||||
// this field/term actually has hits in the new segment, lets write it down
|
tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64)
|
||||||
freqOffset := uint64(w.Count())
|
|
||||||
_, err := tfEncoder.Write(w)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
locOffset := uint64(w.Count())
|
|
||||||
_, err = locEncoder.Write(w)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
postingLocOffset := uint64(w.Count())
|
|
||||||
_, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
postingOffset := uint64(w.Count())
|
|
||||||
|
|
||||||
// write out the start of the term info
|
|
||||||
n := binary.PutUvarint(bufMaxVarintLen64, freqOffset)
|
|
||||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
// write out the start of the loc info
|
|
||||||
n = binary.PutUvarint(bufMaxVarintLen64, locOffset)
|
|
||||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
// write out the start of the posting locs
|
|
||||||
n = binary.PutUvarint(bufMaxVarintLen64, postingLocOffset)
|
|
||||||
_, err = w.Write(bufMaxVarintLen64[:n])
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
_, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = newVellum.Insert(term, postingOffset)
|
if postingsOffset > 0 {
|
||||||
|
err = newVellum.Insert(term, postingsOffset)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
newRoaring = roaring.NewBitmap()
|
newRoaring.Clear()
|
||||||
newRoaringLocs = roaring.NewBitmap()
|
|
||||||
|
|
||||||
tfEncoder.Reset()
|
tfEncoder.Reset()
|
||||||
locEncoder.Reset()
|
locEncoder.Reset()
|
||||||
|
|
||||||
|
lastDocNum = 0
|
||||||
|
lastFreq = 0
|
||||||
|
lastNorm = 0
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -301,74 +291,47 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
term, itrI, postingsOffset := enumerator.Current()
|
term, itrI, postingsOffset := enumerator.Current()
|
||||||
|
|
||||||
if !bytes.Equal(prevTerm, term) {
|
if !bytes.Equal(prevTerm, term) {
|
||||||
|
// check whether closeCh has been closed in the meantime
|
||||||
|
if isClosed(closeCh) {
|
||||||
|
return nil, 0, seg.ErrClosed
|
||||||
|
}
|
||||||
|
|
||||||
// if the term changed, write out the info collected
|
// if the term changed, write out the info collected
|
||||||
// for the previous term
|
// for the previous term
|
||||||
err2 := finishTerm(prevTerm)
|
err = finishTerm(prevTerm)
|
||||||
if err2 != nil {
|
if err != nil {
|
||||||
return nil, 0, err2
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var err2 error
|
postings, err = dicts[itrI].postingsListFromOffset(
|
||||||
postings, err2 = dicts[itrI].postingsListFromOffset(
|
|
||||||
postingsOffset, drops[itrI], postings)
|
postingsOffset, drops[itrI], postings)
|
||||||
if err2 != nil {
|
|
||||||
return nil, 0, err2
|
|
||||||
}
|
|
||||||
|
|
||||||
newDocNumsI := newDocNums[itrI]
|
|
||||||
|
|
||||||
postItr = postings.iterator(postItr)
|
|
||||||
next, err2 := postItr.Next()
|
|
||||||
for next != nil && err2 == nil {
|
|
||||||
hitNewDocNum := newDocNumsI[next.Number()]
|
|
||||||
if hitNewDocNum == docDropped {
|
|
||||||
return nil, 0, fmt.Errorf("see hit with dropped doc num")
|
|
||||||
}
|
|
||||||
newRoaring.Add(uint32(hitNewDocNum))
|
|
||||||
// encode norm bits
|
|
||||||
norm := next.Norm()
|
|
||||||
normBits := math.Float32bits(float32(norm))
|
|
||||||
err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits))
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
locs := next.Locations()
|
|
||||||
if len(locs) > 0 {
|
postItr = postings.iterator(true, true, true, postItr)
|
||||||
newRoaringLocs.Add(uint32(hitNewDocNum))
|
|
||||||
for _, loc := range locs {
|
if fieldsSame {
|
||||||
if cap(bufLoc) < 5+len(loc.ArrayPositions()) {
|
// can optimize by copying freq/norm/loc bytes directly
|
||||||
bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions()))
|
lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying(
|
||||||
|
term, postItr, newDocNums[itrI], newRoaring,
|
||||||
|
tfEncoder, locEncoder)
|
||||||
|
} else {
|
||||||
|
lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs(
|
||||||
|
fieldsMap, term, postItr, newDocNums[itrI], newRoaring,
|
||||||
|
tfEncoder, locEncoder, bufLoc)
|
||||||
}
|
}
|
||||||
args := bufLoc[0:5]
|
|
||||||
args[0] = uint64(fieldsMap[loc.Field()] - 1)
|
|
||||||
args[1] = loc.Pos()
|
|
||||||
args[2] = loc.Start()
|
|
||||||
args[3] = loc.End()
|
|
||||||
args[4] = uint64(len(loc.ArrayPositions()))
|
|
||||||
args = append(args, loc.ArrayPositions()...)
|
|
||||||
err = locEncoder.Add(hitNewDocNum, args...)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
docTermMap[hitNewDocNum] =
|
|
||||||
append(append(docTermMap[hitNewDocNum], term...), termSeparator)
|
|
||||||
|
|
||||||
next, err2 = postItr.Next()
|
|
||||||
}
|
|
||||||
if err2 != nil {
|
|
||||||
return nil, 0, err2
|
|
||||||
}
|
|
||||||
|
|
||||||
prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem
|
prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem
|
||||||
prevTerm = append(prevTerm, term...)
|
prevTerm = append(prevTerm, term...)
|
||||||
|
|
||||||
err = enumerator.Next()
|
err = enumerator.Next()
|
||||||
}
|
}
|
||||||
if err != nil && err != vellum.ErrIteratorDone {
|
if err != vellum.ErrIteratorDone {
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -400,26 +363,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
|
|
||||||
rv[fieldID] = dictOffset
|
rv[fieldID] = dictOffset
|
||||||
|
|
||||||
|
// get the field doc value offset (start)
|
||||||
|
fieldDvLocsStart[fieldID] = uint64(w.Count())
|
||||||
|
|
||||||
// update the field doc values
|
// update the field doc values
|
||||||
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1)
|
fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true)
|
||||||
for docNum, docTerms := range docTermMap {
|
|
||||||
if len(docTerms) > 0 {
|
fdvReadersAvailable := false
|
||||||
err = fdvEncoder.Add(uint64(docNum), docTerms)
|
var dvIterClone *docValueReader
|
||||||
|
for segmentI, segment := range segmentsInFocus {
|
||||||
|
// check for the closure in meantime
|
||||||
|
if isClosed(closeCh) {
|
||||||
|
return nil, 0, seg.ErrClosed
|
||||||
|
}
|
||||||
|
|
||||||
|
fieldIDPlus1 := uint16(segment.fieldsMap[fieldName])
|
||||||
|
if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists &&
|
||||||
|
dvIter != nil {
|
||||||
|
fdvReadersAvailable = true
|
||||||
|
dvIterClone = dvIter.cloneInto(dvIterClone)
|
||||||
|
err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error {
|
||||||
|
if newDocNums[segmentI][docNum] == docDropped {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if fdvReadersAvailable {
|
||||||
err = fdvEncoder.Close()
|
err = fdvEncoder.Close()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// get the field doc value offset
|
|
||||||
fieldDvLocs[fieldID] = uint64(w.Count())
|
|
||||||
|
|
||||||
// persist the doc value details for this field
|
// persist the doc value details for this field
|
||||||
_, err = fdvEncoder.Write(w)
|
_, err = fdvEncoder.Write()
|
||||||
|
if err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the field doc value offset (end)
|
||||||
|
fieldDvLocsEnd[fieldID] = uint64(w.Count())
|
||||||
|
} else {
|
||||||
|
fieldDvLocsStart[fieldID] = fieldNotUninverted
|
||||||
|
fieldDvLocsEnd[fieldID] = fieldNotUninverted
|
||||||
|
}
|
||||||
|
|
||||||
|
// reset vellum buffer and vellum builder
|
||||||
|
vellumBuf.Reset()
|
||||||
|
err = newVellum.Reset(&vellumBuf)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
@ -428,38 +428,210 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap,
|
||||||
fieldDvLocsOffset := uint64(w.Count())
|
fieldDvLocsOffset := uint64(w.Count())
|
||||||
|
|
||||||
buf := bufMaxVarintLen64
|
buf := bufMaxVarintLen64
|
||||||
for _, offset := range fieldDvLocs {
|
for i := 0; i < len(fieldDvLocsStart); i++ {
|
||||||
n := binary.PutUvarint(buf, uint64(offset))
|
n := binary.PutUvarint(buf, fieldDvLocsStart[i])
|
||||||
_, err := w.Write(buf[:n])
|
_, err := w.Write(buf[:n])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, 0, err
|
return nil, 0, err
|
||||||
}
|
}
|
||||||
|
n = binary.PutUvarint(buf, fieldDvLocsEnd[i])
|
||||||
|
_, err = w.Write(buf[:n])
|
||||||
|
if err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return rv, fieldDvLocsOffset, nil
|
return rv, fieldDvLocsOffset, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator,
|
||||||
|
newDocNums []uint64, newRoaring *roaring.Bitmap,
|
||||||
|
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) (
|
||||||
|
lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) {
|
||||||
|
next, err := postItr.Next()
|
||||||
|
for next != nil && err == nil {
|
||||||
|
hitNewDocNum := newDocNums[next.Number()]
|
||||||
|
if hitNewDocNum == docDropped {
|
||||||
|
return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum")
|
||||||
|
}
|
||||||
|
|
||||||
|
newRoaring.Add(uint32(hitNewDocNum))
|
||||||
|
|
||||||
|
nextFreq := next.Frequency()
|
||||||
|
nextNorm := uint64(math.Float32bits(float32(next.Norm())))
|
||||||
|
|
||||||
|
locs := next.Locations()
|
||||||
|
|
||||||
|
err = tfEncoder.Add(hitNewDocNum,
|
||||||
|
encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(locs) > 0 {
|
||||||
|
numBytesLocs := 0
|
||||||
|
for _, loc := range locs {
|
||||||
|
ap := loc.ArrayPositions()
|
||||||
|
numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1),
|
||||||
|
loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap)
|
||||||
|
}
|
||||||
|
|
||||||
|
err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs))
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, loc := range locs {
|
||||||
|
ap := loc.ArrayPositions()
|
||||||
|
if cap(bufLoc) < 5+len(ap) {
|
||||||
|
bufLoc = make([]uint64, 0, 5+len(ap))
|
||||||
|
}
|
||||||
|
args := bufLoc[0:5]
|
||||||
|
args[0] = uint64(fieldsMap[loc.Field()] - 1)
|
||||||
|
args[1] = loc.Pos()
|
||||||
|
args[2] = loc.Start()
|
||||||
|
args[3] = loc.End()
|
||||||
|
args[4] = uint64(len(ap))
|
||||||
|
args = append(args, ap...)
|
||||||
|
err = locEncoder.Add(hitNewDocNum, args...)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lastDocNum = hitNewDocNum
|
||||||
|
lastFreq = nextFreq
|
||||||
|
lastNorm = nextNorm
|
||||||
|
|
||||||
|
next, err = postItr.Next()
|
||||||
|
}
|
||||||
|
|
||||||
|
return lastDocNum, lastFreq, lastNorm, bufLoc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator,
|
||||||
|
newDocNums []uint64, newRoaring *roaring.Bitmap,
|
||||||
|
tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) (
|
||||||
|
lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) {
|
||||||
|
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err :=
|
||||||
|
postItr.nextBytes()
|
||||||
|
for err == nil && len(nextFreqNormBytes) > 0 {
|
||||||
|
hitNewDocNum := newDocNums[nextDocNum]
|
||||||
|
if hitNewDocNum == docDropped {
|
||||||
|
return 0, 0, 0, fmt.Errorf("see hit with dropped doc num")
|
||||||
|
}
|
||||||
|
|
||||||
|
newRoaring.Add(uint32(hitNewDocNum))
|
||||||
|
err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(nextLocBytes) > 0 {
|
||||||
|
err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lastDocNum = hitNewDocNum
|
||||||
|
lastFreq = nextFreq
|
||||||
|
lastNorm = nextNorm
|
||||||
|
|
||||||
|
nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err =
|
||||||
|
postItr.nextBytes()
|
||||||
|
}
|
||||||
|
|
||||||
|
return lastDocNum, lastFreq, lastNorm, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder,
|
||||||
|
use1HitEncoding func(uint64) (bool, uint64, uint64),
|
||||||
|
w *CountHashWriter, bufMaxVarintLen64 []byte) (
|
||||||
|
offset uint64, err error) {
|
||||||
|
termCardinality := postings.GetCardinality()
|
||||||
|
if termCardinality <= 0 {
|
||||||
|
return 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
if use1HitEncoding != nil {
|
||||||
|
encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality)
|
||||||
|
if encodeAs1Hit {
|
||||||
|
return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tfOffset := uint64(w.Count())
|
||||||
|
_, err = tfEncoder.Write(w)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
locOffset := uint64(w.Count())
|
||||||
|
_, err = locEncoder.Write(w)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
postingsOffset := uint64(w.Count())
|
||||||
|
|
||||||
|
n := binary.PutUvarint(bufMaxVarintLen64, tfOffset)
|
||||||
|
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
n = binary.PutUvarint(bufMaxVarintLen64, locOffset)
|
||||||
|
_, err = w.Write(bufMaxVarintLen64[:n])
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return postingsOffset, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// varintEncoder is a callback that encodes one uint64 and reports an int
// and an error.
// NOTE(review): presumably the int is the number of bytes written, matching
// the metaEncode closure in mergeStoredAndRemap — confirm against callers.
type varintEncoder func(uint64) (int, error)
|
||||||
|
|
||||||
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64,
|
fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64,
|
||||||
w *CountHashWriter) (uint64, [][]uint64, error) {
|
w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) {
|
||||||
var rv [][]uint64 // The remapped or newDocNums for each segment.
|
var rv [][]uint64 // The remapped or newDocNums for each segment.
|
||||||
|
|
||||||
var newDocNum uint64
|
var newDocNum uint64
|
||||||
|
|
||||||
var curr int
|
var curr int
|
||||||
var metaBuf bytes.Buffer
|
|
||||||
var data, compressed []byte
|
var data, compressed []byte
|
||||||
|
var metaBuf bytes.Buffer
|
||||||
metaEncoder := govarint.NewU64Base128Encoder(&metaBuf)
|
varBuf := make([]byte, binary.MaxVarintLen64)
|
||||||
|
metaEncode := func(val uint64) (int, error) {
|
||||||
|
wb := binary.PutUvarint(varBuf, val)
|
||||||
|
return metaBuf.Write(varBuf[:wb])
|
||||||
|
}
|
||||||
|
|
||||||
vals := make([][][]byte, len(fieldsInv))
|
vals := make([][][]byte, len(fieldsInv))
|
||||||
typs := make([][]byte, len(fieldsInv))
|
typs := make([][]byte, len(fieldsInv))
|
||||||
poss := make([][][]uint64, len(fieldsInv))
|
poss := make([][][]uint64, len(fieldsInv))
|
||||||
|
|
||||||
|
var posBuf []uint64
|
||||||
|
|
||||||
docNumOffsets := make([]uint64, newSegDocCount)
|
docNumOffsets := make([]uint64, newSegDocCount)
|
||||||
|
|
||||||
|
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
|
||||||
|
defer visitDocumentCtxPool.Put(vdc)
|
||||||
|
|
||||||
// for each segment
|
// for each segment
|
||||||
for segI, segment := range segments {
|
for segI, segment := range segments {
|
||||||
|
// check for the closure in meantime
|
||||||
|
if isClosed(closeCh) {
|
||||||
|
return 0, nil, seg.ErrClosed
|
||||||
|
}
|
||||||
|
|
||||||
segNewDocNums := make([]uint64, segment.numDocs)
|
segNewDocNums := make([]uint64, segment.numDocs)
|
||||||
|
|
||||||
dropsI := drops[segI]
|
dropsI := drops[segI]
|
||||||
|
@ -495,7 +667,8 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
curr = 0
|
curr = 0
|
||||||
metaBuf.Reset()
|
metaBuf.Reset()
|
||||||
data = data[:0]
|
data = data[:0]
|
||||||
compressed = compressed[:0]
|
|
||||||
|
posTemp := posBuf
|
||||||
|
|
||||||
// collect all the data
|
// collect all the data
|
||||||
for i := 0; i < len(fieldsInv); i++ {
|
for i := 0; i < len(fieldsInv); i++ {
|
||||||
|
@ -503,42 +676,63 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
typs[i] = typs[i][:0]
|
typs[i] = typs[i][:0]
|
||||||
poss[i] = poss[i][:0]
|
poss[i] = poss[i][:0]
|
||||||
}
|
}
|
||||||
err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
|
err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool {
|
||||||
fieldID := int(fieldsMap[field]) - 1
|
fieldID := int(fieldsMap[field]) - 1
|
||||||
vals[fieldID] = append(vals[fieldID], value)
|
vals[fieldID] = append(vals[fieldID], value)
|
||||||
typs[fieldID] = append(typs[fieldID], typ)
|
typs[fieldID] = append(typs[fieldID], typ)
|
||||||
poss[fieldID] = append(poss[fieldID], pos)
|
|
||||||
|
// copy array positions to preserve them beyond the scope of this callback
|
||||||
|
var curPos []uint64
|
||||||
|
if len(pos) > 0 {
|
||||||
|
if cap(posTemp) < len(pos) {
|
||||||
|
posBuf = make([]uint64, len(pos)*len(fieldsInv))
|
||||||
|
posTemp = posBuf
|
||||||
|
}
|
||||||
|
curPos = posTemp[0:len(pos)]
|
||||||
|
copy(curPos, pos)
|
||||||
|
posTemp = posTemp[len(pos):]
|
||||||
|
}
|
||||||
|
poss[fieldID] = append(poss[fieldID], curPos)
|
||||||
|
|
||||||
return true
|
return true
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, err
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// now walk the fields in order
|
// _id field special case optimizes ExternalID() lookups
|
||||||
for fieldID := range fieldsInv {
|
idFieldVal := vals[uint16(0)][0]
|
||||||
storedFieldValues := vals[int(fieldID)]
|
_, err = metaEncode(uint64(len(idFieldVal)))
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
stf := typs[int(fieldID)]
|
// now walk the non-"_id" fields in order
|
||||||
spf := poss[int(fieldID)]
|
for fieldID := 1; fieldID < len(fieldsInv); fieldID++ {
|
||||||
|
storedFieldValues := vals[fieldID]
|
||||||
|
|
||||||
|
stf := typs[fieldID]
|
||||||
|
spf := poss[fieldID]
|
||||||
|
|
||||||
var err2 error
|
var err2 error
|
||||||
curr, data, err2 = persistStoredFieldValues(fieldID,
|
curr, data, err2 = persistStoredFieldValues(fieldID,
|
||||||
storedFieldValues, stf, spf, curr, metaEncoder, data)
|
storedFieldValues, stf, spf, curr, metaEncode, data)
|
||||||
if err2 != nil {
|
if err2 != nil {
|
||||||
return 0, nil, err2
|
return 0, nil, err2
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
metaEncoder.Close()
|
|
||||||
metaBytes := metaBuf.Bytes()
|
metaBytes := metaBuf.Bytes()
|
||||||
|
|
||||||
compressed = snappy.Encode(compressed, data)
|
compressed = snappy.Encode(compressed[:cap(compressed)], data)
|
||||||
|
|
||||||
// record where we're about to start writing
|
// record where we're about to start writing
|
||||||
docNumOffsets[newDocNum] = uint64(w.Count())
|
docNumOffsets[newDocNum] = uint64(w.Count())
|
||||||
|
|
||||||
// write out the meta len and compressed data len
|
// write out the meta len and compressed data len
|
||||||
_, err = writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed)))
|
_, err = writeUvarints(w,
|
||||||
|
uint64(len(metaBytes)),
|
||||||
|
uint64(len(idFieldVal)+len(compressed)))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, err
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
|
@ -547,6 +741,11 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap,
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, nil, err
|
return 0, nil, err
|
||||||
}
|
}
|
||||||
|
// now write the _id field val (counted as part of the 'compressed' data)
|
||||||
|
_, err = w.Write(idFieldVal)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
// now write the compressed data
|
// now write the compressed data
|
||||||
_, err = w.Write(compressed)
|
_, err = w.Write(compressed)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -644,3 +843,12 @@ func mergeFields(segments []*SegmentBase) (bool, []string) {
|
||||||
|
|
||||||
return fieldsSame, rv
|
return fieldsSame, rv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isClosed reports, without blocking, whether a receive on closeCh would
// succeed right now (which is the case once the channel has been closed).
func isClosed(closeCh chan struct{}) bool {
	closed := false
	select {
	case <-closeCh:
		closed = true
	default:
		// nothing ready: the channel is still open (or nil)
	}
	return closed
}
|
||||||
|
|
826
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go
generated
vendored
Normal file
826
vendor/github.com/blevesearch/bleve/index/scorch/segment/zap/new.go
generated
vendored
Normal file
|
@ -0,0 +1,826 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package zap
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"encoding/binary"
|
||||||
|
"math"
|
||||||
|
"sort"
|
||||||
|
"sync"
|
||||||
|
|
||||||
|
"github.com/RoaringBitmap/roaring"
|
||||||
|
"github.com/blevesearch/bleve/analysis"
|
||||||
|
"github.com/blevesearch/bleve/document"
|
||||||
|
"github.com/blevesearch/bleve/index"
|
||||||
|
"github.com/couchbase/vellum"
|
||||||
|
"github.com/golang/snappy"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Tuning knobs for the initial output-buffer size estimate made by
// AnalysisResultsToSegmentBase.
var (
	// NewSegmentBufferNumResultsBump is added to the number of analysis
	// results before the estimate is computed.
	NewSegmentBufferNumResultsBump = 100

	// NewSegmentBufferNumResultsFactor is a multiplier applied while
	// computing the estimate.
	NewSegmentBufferNumResultsFactor = 1.0

	// NewSegmentBufferAvgBytesPerDocFactor is a multiplier applied while
	// computing the estimate.
	NewSegmentBufferAvgBytesPerDocFactor = 1.0
)
|
||||||
|
|
||||||
|
// AnalysisResultsToSegmentBase produces an in-memory zap-encoded
|
||||||
|
// SegmentBase from analysis results
|
||||||
|
func AnalysisResultsToSegmentBase(results []*index.AnalysisResult,
|
||||||
|
chunkFactor uint32) (*SegmentBase, uint64, error) {
|
||||||
|
s := interimPool.Get().(*interim)
|
||||||
|
|
||||||
|
var br bytes.Buffer
|
||||||
|
if s.lastNumDocs > 0 {
|
||||||
|
// use previous results to initialize the buf with an estimate
|
||||||
|
// size, but note that the interim instance comes from a
|
||||||
|
// global interimPool, so multiple scorch instances indexing
|
||||||
|
// different docs can lead to low quality estimates
|
||||||
|
estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) *
|
||||||
|
NewSegmentBufferNumResultsFactor)
|
||||||
|
estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) *
|
||||||
|
NewSegmentBufferAvgBytesPerDocFactor)
|
||||||
|
br.Grow(estimateAvgBytesPerDoc * estimateNumResults)
|
||||||
|
}
|
||||||
|
|
||||||
|
s.results = results
|
||||||
|
s.chunkFactor = chunkFactor
|
||||||
|
s.w = NewCountHashWriter(&br)
|
||||||
|
|
||||||
|
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets,
|
||||||
|
err := s.convert()
|
||||||
|
if err != nil {
|
||||||
|
return nil, uint64(0), err
|
||||||
|
}
|
||||||
|
|
||||||
|
sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor,
|
||||||
|
s.FieldsMap, s.FieldsInv, uint64(len(results)),
|
||||||
|
storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets)
|
||||||
|
|
||||||
|
if err == nil && s.reset() == nil {
|
||||||
|
s.lastNumDocs = len(results)
|
||||||
|
s.lastOutSize = len(br.Bytes())
|
||||||
|
interimPool.Put(s)
|
||||||
|
}
|
||||||
|
|
||||||
|
return sb, uint64(len(br.Bytes())), err
|
||||||
|
}
|
||||||
|
|
||||||
|
// interimPool recycles interim working state between conversions;
// AnalysisResultsToSegmentBase takes an instance from it and puts the
// instance back after a successful run and reset.
var interimPool = sync.Pool{New: func() interface{} { return &interim{} }}
|
||||||
|
|
||||||
|
// interim holds temporary working data used while converting from
|
||||||
|
// analysis results to a zap-encoded segment
|
||||||
|
type interim struct {
|
||||||
|
results []*index.AnalysisResult
|
||||||
|
|
||||||
|
chunkFactor uint32
|
||||||
|
|
||||||
|
w *CountHashWriter
|
||||||
|
|
||||||
|
// FieldsMap adds 1 to field id to avoid zero value issues
|
||||||
|
// name -> field id + 1
|
||||||
|
FieldsMap map[string]uint16
|
||||||
|
|
||||||
|
// FieldsInv is the inverse of FieldsMap
|
||||||
|
// field id -> name
|
||||||
|
FieldsInv []string
|
||||||
|
|
||||||
|
// Term dictionaries for each field
|
||||||
|
// field id -> term -> postings list id + 1
|
||||||
|
Dicts []map[string]uint64
|
||||||
|
|
||||||
|
// Terms for each field, where terms are sorted ascending
|
||||||
|
// field id -> []term
|
||||||
|
DictKeys [][]string
|
||||||
|
|
||||||
|
// Fields whose IncludeDocValues is true
|
||||||
|
// field id -> bool
|
||||||
|
IncludeDocValues []bool
|
||||||
|
|
||||||
|
// postings id -> bitmap of docNums
|
||||||
|
Postings []*roaring.Bitmap
|
||||||
|
|
||||||
|
// postings id -> freq/norm's, one for each docNum in postings
|
||||||
|
FreqNorms [][]interimFreqNorm
|
||||||
|
freqNormsBacking []interimFreqNorm
|
||||||
|
|
||||||
|
// postings id -> locs, one for each freq
|
||||||
|
Locs [][]interimLoc
|
||||||
|
locsBacking []interimLoc
|
||||||
|
|
||||||
|
numTermsPerPostingsList []int // key is postings list id
|
||||||
|
numLocsPerPostingsList []int // key is postings list id
|
||||||
|
|
||||||
|
builder *vellum.Builder
|
||||||
|
builderBuf bytes.Buffer
|
||||||
|
|
||||||
|
metaBuf bytes.Buffer
|
||||||
|
|
||||||
|
tmp0 []byte
|
||||||
|
tmp1 []byte
|
||||||
|
|
||||||
|
lastNumDocs int
|
||||||
|
lastOutSize int
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *interim) reset() (err error) {
|
||||||
|
s.results = nil
|
||||||
|
s.chunkFactor = 0
|
||||||
|
s.w = nil
|
||||||
|
s.FieldsMap = nil
|
||||||
|
s.FieldsInv = nil
|
||||||
|
for i := range s.Dicts {
|
||||||
|
s.Dicts[i] = nil
|
||||||
|
}
|
||||||
|
s.Dicts = s.Dicts[:0]
|
||||||
|
for i := range s.DictKeys {
|
||||||
|
s.DictKeys[i] = s.DictKeys[i][:0]
|
||||||
|
}
|
||||||
|
s.DictKeys = s.DictKeys[:0]
|
||||||
|
for i := range s.IncludeDocValues {
|
||||||
|
s.IncludeDocValues[i] = false
|
||||||
|
}
|
||||||
|
s.IncludeDocValues = s.IncludeDocValues[:0]
|
||||||
|
for _, idn := range s.Postings {
|
||||||
|
idn.Clear()
|
||||||
|
}
|
||||||
|
s.Postings = s.Postings[:0]
|
||||||
|
s.FreqNorms = s.FreqNorms[:0]
|
||||||
|
for i := range s.freqNormsBacking {
|
||||||
|
s.freqNormsBacking[i] = interimFreqNorm{}
|
||||||
|
}
|
||||||
|
s.freqNormsBacking = s.freqNormsBacking[:0]
|
||||||
|
s.Locs = s.Locs[:0]
|
||||||
|
for i := range s.locsBacking {
|
||||||
|
s.locsBacking[i] = interimLoc{}
|
||||||
|
}
|
||||||
|
s.locsBacking = s.locsBacking[:0]
|
||||||
|
s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0]
|
||||||
|
s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0]
|
||||||
|
s.builderBuf.Reset()
|
||||||
|
if s.builder != nil {
|
||||||
|
err = s.builder.Reset(&s.builderBuf)
|
||||||
|
}
|
||||||
|
s.metaBuf.Reset()
|
||||||
|
s.tmp0 = s.tmp0[:0]
|
||||||
|
s.tmp1 = s.tmp1[:0]
|
||||||
|
s.lastNumDocs = 0
|
||||||
|
s.lastOutSize = 0
|
||||||
|
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *interim) grabBuf(size int) []byte {
|
||||||
|
buf := s.tmp0
|
||||||
|
if cap(buf) < size {
|
||||||
|
buf = make([]byte, size)
|
||||||
|
s.tmp0 = buf
|
||||||
|
}
|
||||||
|
return buf[0:size]
|
||||||
|
}
|
||||||
|
|
||||||
|
// interimStoredField groups three parallel slices for one stored field:
// values, their type bytes, and their array positions.
type interimStoredField struct {
	vals      [][]byte
	typs      []byte
	arrayposs [][]uint64 // array positions
}
|
||||||
|
|
||||||
|
// interimFreqNorm is one freq/norm entry of a postings list, together with
// the number of locations recorded for it.
type interimFreqNorm struct {
	freq    uint64
	norm    float32
	numLocs int
}
|
||||||
|
|
||||||
|
// interimLoc is one term location: the field it occurred in, its position,
// its start and end values, and any array positions.
type interimLoc struct {
	fieldID         uint16
	pos, start, end uint64
	arrayposs       []uint64
}
|
||||||
|
|
||||||
|
func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) {
|
||||||
|
s.FieldsMap = map[string]uint16{}
|
||||||
|
|
||||||
|
s.getOrDefineField("_id") // _id field is fieldID 0
|
||||||
|
|
||||||
|
for _, result := range s.results {
|
||||||
|
for _, field := range result.Document.CompositeFields {
|
||||||
|
s.getOrDefineField(field.Name())
|
||||||
|
}
|
||||||
|
for _, field := range result.Document.Fields {
|
||||||
|
s.getOrDefineField(field.Name())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Strings(s.FieldsInv[1:]) // keep _id as first field
|
||||||
|
|
||||||
|
for fieldID, fieldName := range s.FieldsInv {
|
||||||
|
s.FieldsMap[fieldName] = uint16(fieldID + 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cap(s.IncludeDocValues) >= len(s.FieldsInv) {
|
||||||
|
s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)]
|
||||||
|
} else {
|
||||||
|
s.IncludeDocValues = make([]bool, len(s.FieldsInv))
|
||||||
|
}
|
||||||
|
|
||||||
|
s.prepareDicts()
|
||||||
|
|
||||||
|
for _, dict := range s.DictKeys {
|
||||||
|
sort.Strings(dict)
|
||||||
|
}
|
||||||
|
|
||||||
|
s.processDocuments()
|
||||||
|
|
||||||
|
storedIndexOffset, err := s.writeStoredFields()
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var fdvIndexOffset uint64
|
||||||
|
var dictOffsets []uint64
|
||||||
|
|
||||||
|
if len(s.results) > 0 {
|
||||||
|
fdvIndexOffset, dictOffsets, err = s.writeDicts()
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, nil, err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
dictOffsets = make([]uint64, len(s.FieldsInv))
|
||||||
|
}
|
||||||
|
|
||||||
|
fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets)
|
||||||
|
if err != nil {
|
||||||
|
return 0, 0, 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *interim) getOrDefineField(fieldName string) int {
|
||||||
|
fieldIDPlus1, exists := s.FieldsMap[fieldName]
|
||||||
|
if !exists {
|
||||||
|
fieldIDPlus1 = uint16(len(s.FieldsInv) + 1)
|
||||||
|
s.FieldsMap[fieldName] = fieldIDPlus1
|
||||||
|
s.FieldsInv = append(s.FieldsInv, fieldName)
|
||||||
|
|
||||||
|
s.Dicts = append(s.Dicts, make(map[string]uint64))
|
||||||
|
|
||||||
|
n := len(s.DictKeys)
|
||||||
|
if n < cap(s.DictKeys) {
|
||||||
|
s.DictKeys = s.DictKeys[:n+1]
|
||||||
|
s.DictKeys[n] = s.DictKeys[n][:0]
|
||||||
|
} else {
|
||||||
|
s.DictKeys = append(s.DictKeys, []string(nil))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return int(fieldIDPlus1 - 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// fill Dicts and DictKeys from analysis results
|
||||||
|
func (s *interim) prepareDicts() {
|
||||||
|
var pidNext int
|
||||||
|
|
||||||
|
var totTFs int
|
||||||
|
var totLocs int
|
||||||
|
|
||||||
|
visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) {
|
||||||
|
dict := s.Dicts[fieldID]
|
||||||
|
dictKeys := s.DictKeys[fieldID]
|
||||||
|
|
||||||
|
for term, tf := range tfs {
|
||||||
|
pidPlus1, exists := dict[term]
|
||||||
|
if !exists {
|
||||||
|
pidNext++
|
||||||
|
pidPlus1 = uint64(pidNext)
|
||||||
|
|
||||||
|
dict[term] = pidPlus1
|
||||||
|
dictKeys = append(dictKeys, term)
|
||||||
|
|
||||||
|
s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0)
|
||||||
|
s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
pid := pidPlus1 - 1
|
||||||
|
|
||||||
|
s.numTermsPerPostingsList[pid] += 1
|
||||||
|
s.numLocsPerPostingsList[pid] += len(tf.Locations)
|
||||||
|
|
||||||
|
totLocs += len(tf.Locations)
|
||||||
|
}
|
||||||
|
|
||||||
|
totTFs += len(tfs)
|
||||||
|
|
||||||
|
s.DictKeys[fieldID] = dictKeys
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, result := range s.results {
|
||||||
|
// walk each composite field
|
||||||
|
for _, field := range result.Document.CompositeFields {
|
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||||
|
_, tf := field.Analyze()
|
||||||
|
visitField(fieldID, tf)
|
||||||
|
}
|
||||||
|
|
||||||
|
// walk each field
|
||||||
|
for i, field := range result.Document.Fields {
|
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||||
|
tf := result.Analyzed[i]
|
||||||
|
visitField(fieldID, tf)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
numPostingsLists := pidNext
|
||||||
|
|
||||||
|
if cap(s.Postings) >= numPostingsLists {
|
||||||
|
s.Postings = s.Postings[:numPostingsLists]
|
||||||
|
} else {
|
||||||
|
postings := make([]*roaring.Bitmap, numPostingsLists)
|
||||||
|
copy(postings, s.Postings[:cap(s.Postings)])
|
||||||
|
for i := 0; i < numPostingsLists; i++ {
|
||||||
|
if postings[i] == nil {
|
||||||
|
postings[i] = roaring.New()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s.Postings = postings
|
||||||
|
}
|
||||||
|
|
||||||
|
if cap(s.FreqNorms) >= numPostingsLists {
|
||||||
|
s.FreqNorms = s.FreqNorms[:numPostingsLists]
|
||||||
|
} else {
|
||||||
|
s.FreqNorms = make([][]interimFreqNorm, numPostingsLists)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cap(s.freqNormsBacking) >= totTFs {
|
||||||
|
s.freqNormsBacking = s.freqNormsBacking[:totTFs]
|
||||||
|
} else {
|
||||||
|
s.freqNormsBacking = make([]interimFreqNorm, totTFs)
|
||||||
|
}
|
||||||
|
|
||||||
|
freqNormsBacking := s.freqNormsBacking
|
||||||
|
for pid, numTerms := range s.numTermsPerPostingsList {
|
||||||
|
s.FreqNorms[pid] = freqNormsBacking[0:0]
|
||||||
|
freqNormsBacking = freqNormsBacking[numTerms:]
|
||||||
|
}
|
||||||
|
|
||||||
|
if cap(s.Locs) >= numPostingsLists {
|
||||||
|
s.Locs = s.Locs[:numPostingsLists]
|
||||||
|
} else {
|
||||||
|
s.Locs = make([][]interimLoc, numPostingsLists)
|
||||||
|
}
|
||||||
|
|
||||||
|
if cap(s.locsBacking) >= totLocs {
|
||||||
|
s.locsBacking = s.locsBacking[:totLocs]
|
||||||
|
} else {
|
||||||
|
s.locsBacking = make([]interimLoc, totLocs)
|
||||||
|
}
|
||||||
|
|
||||||
|
locsBacking := s.locsBacking
|
||||||
|
for pid, numLocs := range s.numLocsPerPostingsList {
|
||||||
|
s.Locs[pid] = locsBacking[0:0]
|
||||||
|
locsBacking = locsBacking[numLocs:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *interim) processDocuments() {
|
||||||
|
numFields := len(s.FieldsInv)
|
||||||
|
reuseFieldLens := make([]int, numFields)
|
||||||
|
reuseFieldTFs := make([]analysis.TokenFrequencies, numFields)
|
||||||
|
|
||||||
|
for docNum, result := range s.results {
|
||||||
|
for i := 0; i < numFields; i++ { // clear these for reuse
|
||||||
|
reuseFieldLens[i] = 0
|
||||||
|
reuseFieldTFs[i] = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
s.processDocument(uint64(docNum), result,
|
||||||
|
reuseFieldLens, reuseFieldTFs)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *interim) processDocument(docNum uint64,
|
||||||
|
result *index.AnalysisResult,
|
||||||
|
fieldLens []int, fieldTFs []analysis.TokenFrequencies) {
|
||||||
|
visitField := func(fieldID uint16, fieldName string,
|
||||||
|
ln int, tf analysis.TokenFrequencies) {
|
||||||
|
fieldLens[fieldID] += ln
|
||||||
|
|
||||||
|
existingFreqs := fieldTFs[fieldID]
|
||||||
|
if existingFreqs != nil {
|
||||||
|
existingFreqs.MergeAll(fieldName, tf)
|
||||||
|
} else {
|
||||||
|
fieldTFs[fieldID] = tf
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// walk each composite field
|
||||||
|
for _, field := range result.Document.CompositeFields {
|
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||||
|
ln, tf := field.Analyze()
|
||||||
|
visitField(fieldID, field.Name(), ln, tf)
|
||||||
|
}
|
||||||
|
|
||||||
|
// walk each field
|
||||||
|
for i, field := range result.Document.Fields {
|
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||||
|
ln := result.Length[i]
|
||||||
|
tf := result.Analyzed[i]
|
||||||
|
visitField(fieldID, field.Name(), ln, tf)
|
||||||
|
}
|
||||||
|
|
||||||
|
// now that it's been rolled up into fieldTFs, walk that
|
||||||
|
for fieldID, tfs := range fieldTFs {
|
||||||
|
dict := s.Dicts[fieldID]
|
||||||
|
norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))
|
||||||
|
|
||||||
|
for term, tf := range tfs {
|
||||||
|
pid := dict[term] - 1
|
||||||
|
bs := s.Postings[pid]
|
||||||
|
bs.Add(uint32(docNum))
|
||||||
|
|
||||||
|
s.FreqNorms[pid] = append(s.FreqNorms[pid],
|
||||||
|
interimFreqNorm{
|
||||||
|
freq: uint64(tf.Frequency()),
|
||||||
|
norm: norm,
|
||||||
|
numLocs: len(tf.Locations),
|
||||||
|
})
|
||||||
|
|
||||||
|
if len(tf.Locations) > 0 {
|
||||||
|
locs := s.Locs[pid]
|
||||||
|
|
||||||
|
for _, loc := range tf.Locations {
|
||||||
|
var locf = uint16(fieldID)
|
||||||
|
if loc.Field != "" {
|
||||||
|
locf = uint16(s.getOrDefineField(loc.Field))
|
||||||
|
}
|
||||||
|
var arrayposs []uint64
|
||||||
|
if len(loc.ArrayPositions) > 0 {
|
||||||
|
arrayposs = loc.ArrayPositions
|
||||||
|
}
|
||||||
|
locs = append(locs, interimLoc{
|
||||||
|
fieldID: locf,
|
||||||
|
pos: uint64(loc.Position),
|
||||||
|
start: uint64(loc.Start),
|
||||||
|
end: uint64(loc.End),
|
||||||
|
arrayposs: arrayposs,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
s.Locs[pid] = locs
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *interim) writeStoredFields() (
|
||||||
|
storedIndexOffset uint64, err error) {
|
||||||
|
varBuf := make([]byte, binary.MaxVarintLen64)
|
||||||
|
metaEncode := func(val uint64) (int, error) {
|
||||||
|
wb := binary.PutUvarint(varBuf, val)
|
||||||
|
return s.metaBuf.Write(varBuf[:wb])
|
||||||
|
}
|
||||||
|
|
||||||
|
data, compressed := s.tmp0[:0], s.tmp1[:0]
|
||||||
|
defer func() { s.tmp0, s.tmp1 = data, compressed }()
|
||||||
|
|
||||||
|
// keyed by docNum
|
||||||
|
docStoredOffsets := make([]uint64, len(s.results))
|
||||||
|
|
||||||
|
// keyed by fieldID, for the current doc in the loop
|
||||||
|
docStoredFields := map[uint16]interimStoredField{}
|
||||||
|
|
||||||
|
for docNum, result := range s.results {
|
||||||
|
for fieldID := range docStoredFields { // reset for next doc
|
||||||
|
delete(docStoredFields, fieldID)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, field := range result.Document.Fields {
|
||||||
|
fieldID := uint16(s.getOrDefineField(field.Name()))
|
||||||
|
|
||||||
|
opts := field.Options()
|
||||||
|
|
||||||
|
if opts.IsStored() {
|
||||||
|
isf := docStoredFields[fieldID]
|
||||||
|
isf.vals = append(isf.vals, field.Value())
|
||||||
|
isf.typs = append(isf.typs, encodeFieldType(field))
|
||||||
|
isf.arrayposs = append(isf.arrayposs, field.ArrayPositions())
|
||||||
|
docStoredFields[fieldID] = isf
|
||||||
|
}
|
||||||
|
|
||||||
|
if opts.IncludeDocValues() {
|
||||||
|
s.IncludeDocValues[fieldID] = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var curr int
|
||||||
|
|
||||||
|
s.metaBuf.Reset()
|
||||||
|
data = data[:0]
|
||||||
|
|
||||||
|
// _id field special case optimizes ExternalID() lookups
|
||||||
|
idFieldVal := docStoredFields[uint16(0)].vals[0]
|
||||||
|
_, err = metaEncode(uint64(len(idFieldVal)))
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle non-"_id" fields
|
||||||
|
for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ {
|
||||||
|
isf, exists := docStoredFields[uint16(fieldID)]
|
||||||
|
if exists {
|
||||||
|
curr, data, err = persistStoredFieldValues(
|
||||||
|
fieldID, isf.vals, isf.typs, isf.arrayposs,
|
||||||
|
curr, metaEncode, data)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
metaBytes := s.metaBuf.Bytes()
|
||||||
|
|
||||||
|
compressed = snappy.Encode(compressed[:cap(compressed)], data)
|
||||||
|
|
||||||
|
docStoredOffsets[docNum] = uint64(s.w.Count())
|
||||||
|
|
||||||
|
_, err := writeUvarints(s.w,
|
||||||
|
uint64(len(metaBytes)),
|
||||||
|
uint64(len(idFieldVal)+len(compressed)))
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = s.w.Write(metaBytes)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = s.w.Write(idFieldVal)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
|
||||||
|
_, err = s.w.Write(compressed)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
storedIndexOffset = uint64(s.w.Count())
|
||||||
|
|
||||||
|
for _, docStoredOffset := range docStoredOffsets {
|
||||||
|
err = binary.Write(s.w, binary.BigEndian, docStoredOffset)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return storedIndexOffset, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) {
|
||||||
|
dictOffsets = make([]uint64, len(s.FieldsInv))
|
||||||
|
|
||||||
|
fdvOffsetsStart := make([]uint64, len(s.FieldsInv))
|
||||||
|
fdvOffsetsEnd := make([]uint64, len(s.FieldsInv))
|
||||||
|
|
||||||
|
buf := s.grabBuf(binary.MaxVarintLen64)
|
||||||
|
|
||||||
|
tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
|
||||||
|
locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1))
|
||||||
|
fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false)
|
||||||
|
|
||||||
|
var docTermMap [][]byte
|
||||||
|
|
||||||
|
if s.builder == nil {
|
||||||
|
s.builder, err = vellum.New(&s.builderBuf, nil)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for fieldID, terms := range s.DictKeys {
|
||||||
|
if cap(docTermMap) < len(s.results) {
|
||||||
|
docTermMap = make([][]byte, len(s.results))
|
||||||
|
} else {
|
||||||
|
docTermMap = docTermMap[0:len(s.results)]
|
||||||
|
for docNum := range docTermMap { // reset the docTermMap
|
||||||
|
docTermMap[docNum] = docTermMap[docNum][:0]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dict := s.Dicts[fieldID]
|
||||||
|
|
||||||
|
for _, term := range terms { // terms are already sorted
|
||||||
|
pid := dict[term] - 1
|
||||||
|
|
||||||
|
postingsBS := s.Postings[pid]
|
||||||
|
|
||||||
|
freqNorms := s.FreqNorms[pid]
|
||||||
|
freqNormOffset := 0
|
||||||
|
|
||||||
|
locs := s.Locs[pid]
|
||||||
|
locOffset := 0
|
||||||
|
|
||||||
|
postingsItr := postingsBS.Iterator()
|
||||||
|
for postingsItr.HasNext() {
|
||||||
|
docNum := uint64(postingsItr.Next())
|
||||||
|
|
||||||
|
freqNorm := freqNorms[freqNormOffset]
|
||||||
|
|
||||||
|
err = tfEncoder.Add(docNum,
|
||||||
|
encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0),
|
||||||
|
uint64(math.Float32bits(freqNorm.norm)))
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if freqNorm.numLocs > 0 {
|
||||||
|
numBytesLocs := 0
|
||||||
|
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
|
||||||
|
numBytesLocs += totalUvarintBytes(
|
||||||
|
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
|
||||||
|
uint64(len(loc.arrayposs)), loc.arrayposs)
|
||||||
|
}
|
||||||
|
|
||||||
|
err = locEncoder.Add(docNum, uint64(numBytesLocs))
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] {
|
||||||
|
err = locEncoder.Add(docNum,
|
||||||
|
uint64(loc.fieldID), loc.pos, loc.start, loc.end,
|
||||||
|
uint64(len(loc.arrayposs)))
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
err = locEncoder.Add(docNum, loc.arrayposs...)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
locOffset += freqNorm.numLocs
|
||||||
|
}
|
||||||
|
|
||||||
|
freqNormOffset++
|
||||||
|
|
||||||
|
docTermMap[docNum] = append(
|
||||||
|
append(docTermMap[docNum], term...),
|
||||||
|
termSeparator)
|
||||||
|
}
|
||||||
|
|
||||||
|
tfEncoder.Close()
|
||||||
|
locEncoder.Close()
|
||||||
|
|
||||||
|
postingsOffset, err :=
|
||||||
|
writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if postingsOffset > uint64(0) {
|
||||||
|
err = s.builder.Insert([]byte(term), postingsOffset)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tfEncoder.Reset()
|
||||||
|
locEncoder.Reset()
|
||||||
|
}
|
||||||
|
|
||||||
|
err = s.builder.Close()
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// record where this dictionary starts
|
||||||
|
dictOffsets[fieldID] = uint64(s.w.Count())
|
||||||
|
|
||||||
|
vellumData := s.builderBuf.Bytes()
|
||||||
|
|
||||||
|
// write out the length of the vellum data
|
||||||
|
n := binary.PutUvarint(buf, uint64(len(vellumData)))
|
||||||
|
_, err = s.w.Write(buf[:n])
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// write this vellum to disk
|
||||||
|
_, err = s.w.Write(vellumData)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// reset vellum for reuse
|
||||||
|
s.builderBuf.Reset()
|
||||||
|
|
||||||
|
err = s.builder.Reset(&s.builderBuf)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// write the field doc values
|
||||||
|
if s.IncludeDocValues[fieldID] {
|
||||||
|
for docNum, docTerms := range docTermMap {
|
||||||
|
if len(docTerms) > 0 {
|
||||||
|
err = fdvEncoder.Add(uint64(docNum), docTerms)
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
err = fdvEncoder.Close()
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
fdvOffsetsStart[fieldID] = uint64(s.w.Count())
|
||||||
|
|
||||||
|
_, err = fdvEncoder.Write()
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
fdvOffsetsEnd[fieldID] = uint64(s.w.Count())
|
||||||
|
|
||||||
|
fdvEncoder.Reset()
|
||||||
|
} else {
|
||||||
|
fdvOffsetsStart[fieldID] = fieldNotUninverted
|
||||||
|
fdvOffsetsEnd[fieldID] = fieldNotUninverted
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fdvIndexOffset = uint64(s.w.Count())
|
||||||
|
|
||||||
|
for i := 0; i < len(fdvOffsetsStart); i++ {
|
||||||
|
n := binary.PutUvarint(buf, fdvOffsetsStart[i])
|
||||||
|
_, err := s.w.Write(buf[:n])
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
n = binary.PutUvarint(buf, fdvOffsetsEnd[i])
|
||||||
|
_, err = s.w.Write(buf[:n])
|
||||||
|
if err != nil {
|
||||||
|
return 0, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return fdvIndexOffset, dictOffsets, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func encodeFieldType(f document.Field) byte {
|
||||||
|
fieldType := byte('x')
|
||||||
|
switch f.(type) {
|
||||||
|
case *document.TextField:
|
||||||
|
fieldType = 't'
|
||||||
|
case *document.NumericField:
|
||||||
|
fieldType = 'n'
|
||||||
|
case *document.DateTimeField:
|
||||||
|
fieldType = 'd'
|
||||||
|
case *document.BooleanField:
|
||||||
|
fieldType = 'b'
|
||||||
|
case *document.GeoPointField:
|
||||||
|
fieldType = 'g'
|
||||||
|
case *document.CompositeField:
|
||||||
|
fieldType = 'c'
|
||||||
|
}
|
||||||
|
return fieldType
|
||||||
|
}
|
||||||
|
|
||||||
|
// returns the total # of bytes needed to encode the given uint64's
|
||||||
|
// into binary.PutUVarint() encoding
|
||||||
|
func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) {
|
||||||
|
n = numUvarintBytes(a)
|
||||||
|
n += numUvarintBytes(b)
|
||||||
|
n += numUvarintBytes(c)
|
||||||
|
n += numUvarintBytes(d)
|
||||||
|
n += numUvarintBytes(e)
|
||||||
|
for _, v := range more {
|
||||||
|
n += numUvarintBytes(v)
|
||||||
|
}
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
|
||||||
|
// numUvarintBytes returns the number of bytes needed to encode x with
// binary.PutUvarint(): one byte, plus one more for every 7-bit shift
// that still leaves a value of 0x80 or above.
func numUvarintBytes(x uint64) (n int) {
	n = 1
	for ; x >= 0x80; x >>= 7 {
		n++
	}
	return n
}
|
File diff suppressed because it is too large
Load Diff
|
@ -20,16 +20,24 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"os"
|
"os"
|
||||||
|
"reflect"
|
||||||
"sync"
|
"sync"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
"github.com/Smerity/govarint"
|
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
"github.com/couchbase/vellum"
|
"github.com/couchbase/vellum"
|
||||||
mmap "github.com/edsrzf/mmap-go"
|
mmap "github.com/edsrzf/mmap-go"
|
||||||
"github.com/golang/snappy"
|
"github.com/golang/snappy"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeSegmentBase int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var sb SegmentBase
|
||||||
|
reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size())
|
||||||
|
}
|
||||||
|
|
||||||
// Open returns a zap impl of a segment
|
// Open returns a zap impl of a segment
|
||||||
func Open(path string) (segment.Segment, error) {
|
func Open(path string) (segment.Segment, error) {
|
||||||
f, err := os.Open(path)
|
f, err := os.Open(path)
|
||||||
|
@ -47,13 +55,14 @@ func Open(path string) (segment.Segment, error) {
|
||||||
SegmentBase: SegmentBase{
|
SegmentBase: SegmentBase{
|
||||||
mem: mm[0 : len(mm)-FooterSize],
|
mem: mm[0 : len(mm)-FooterSize],
|
||||||
fieldsMap: make(map[string]uint16),
|
fieldsMap: make(map[string]uint16),
|
||||||
fieldDvIterMap: make(map[uint16]*docValueIterator),
|
fieldDvReaders: make(map[uint16]*docValueReader),
|
||||||
},
|
},
|
||||||
f: f,
|
f: f,
|
||||||
mm: mm,
|
mm: mm,
|
||||||
path: path,
|
path: path,
|
||||||
refs: 1,
|
refs: 1,
|
||||||
}
|
}
|
||||||
|
rv.SegmentBase.updateSize()
|
||||||
|
|
||||||
err = rv.loadConfig()
|
err = rv.loadConfig()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -67,7 +76,7 @@ func Open(path string) (segment.Segment, error) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
err = rv.loadDvIterators()
|
err = rv.loadDvReaders()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
_ = rv.Close()
|
_ = rv.Close()
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -89,7 +98,39 @@ type SegmentBase struct {
|
||||||
fieldsIndexOffset uint64
|
fieldsIndexOffset uint64
|
||||||
docValueOffset uint64
|
docValueOffset uint64
|
||||||
dictLocs []uint64
|
dictLocs []uint64
|
||||||
fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field
|
fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field
|
||||||
|
fieldDvNames []string // field names cached in fieldDvReaders
|
||||||
|
size uint64
|
||||||
|
}
|
||||||
|
|
||||||
|
func (sb *SegmentBase) Size() int {
|
||||||
|
return int(sb.size)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (sb *SegmentBase) updateSize() {
|
||||||
|
sizeInBytes := reflectStaticSizeSegmentBase +
|
||||||
|
cap(sb.mem)
|
||||||
|
|
||||||
|
// fieldsMap
|
||||||
|
for k, _ := range sb.fieldsMap {
|
||||||
|
sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16
|
||||||
|
}
|
||||||
|
|
||||||
|
// fieldsInv, dictLocs
|
||||||
|
for _, entry := range sb.fieldsInv {
|
||||||
|
sizeInBytes += len(entry) + size.SizeOfString
|
||||||
|
}
|
||||||
|
sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64
|
||||||
|
|
||||||
|
// fieldDvReaders
|
||||||
|
for _, v := range sb.fieldDvReaders {
|
||||||
|
sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr
|
||||||
|
if v != nil {
|
||||||
|
sizeInBytes += v.size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sb.size = uint64(sizeInBytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sb *SegmentBase) AddRef() {}
|
func (sb *SegmentBase) AddRef() {}
|
||||||
|
@ -111,56 +152,19 @@ type Segment struct {
|
||||||
refs int64
|
refs int64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Segment) SizeInBytes() uint64 {
|
func (s *Segment) Size() int {
|
||||||
// 8 /* size of file pointer */
|
// 8 /* size of file pointer */
|
||||||
// 4 /* size of version -> uint32 */
|
// 4 /* size of version -> uint32 */
|
||||||
// 4 /* size of crc -> uint32 */
|
// 4 /* size of crc -> uint32 */
|
||||||
sizeOfUints := 16
|
sizeOfUints := 16
|
||||||
|
|
||||||
sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints
|
sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints
|
||||||
|
|
||||||
// mutex, refs -> int64
|
// mutex, refs -> int64
|
||||||
sizeInBytes += 16
|
sizeInBytes += 16
|
||||||
|
|
||||||
// do not include the mmap'ed part
|
// do not include the mmap'ed part
|
||||||
return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem))
|
return sizeInBytes + s.SegmentBase.Size() - cap(s.mem)
|
||||||
}
|
|
||||||
|
|
||||||
func (s *SegmentBase) SizeInBytes() uint64 {
|
|
||||||
// 4 /* size of memCRC -> uint32 */
|
|
||||||
// 4 /* size of chunkFactor -> uint32 */
|
|
||||||
// 8 /* size of numDocs -> uint64 */
|
|
||||||
// 8 /* size of storedIndexOffset -> uint64 */
|
|
||||||
// 8 /* size of fieldsIndexOffset -> uint64 */
|
|
||||||
// 8 /* size of docValueOffset -> uint64 */
|
|
||||||
sizeInBytes := 40
|
|
||||||
|
|
||||||
sizeInBytes += len(s.mem) + int(segment.SizeOfSlice)
|
|
||||||
|
|
||||||
// fieldsMap
|
|
||||||
for k, _ := range s.fieldsMap {
|
|
||||||
sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */
|
|
||||||
}
|
|
||||||
sizeInBytes += int(segment.SizeOfMap) /* overhead from map */
|
|
||||||
|
|
||||||
// fieldsInv, dictLocs
|
|
||||||
for _, entry := range s.fieldsInv {
|
|
||||||
sizeInBytes += (len(entry) + int(segment.SizeOfString))
|
|
||||||
}
|
|
||||||
sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */
|
|
||||||
sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */
|
|
||||||
|
|
||||||
// fieldDvIterMap
|
|
||||||
sizeInBytes += len(s.fieldDvIterMap) *
|
|
||||||
int(segment.SizeOfPointer+2 /* size of uint16 */)
|
|
||||||
for _, entry := range s.fieldDvIterMap {
|
|
||||||
if entry != nil {
|
|
||||||
sizeInBytes += int(entry.sizeInBytes())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
sizeInBytes += int(segment.SizeOfMap)
|
|
||||||
|
|
||||||
return uint64(sizeInBytes)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Segment) AddRef() {
|
func (s *Segment) AddRef() {
|
||||||
|
@ -185,7 +189,7 @@ func (s *Segment) loadConfig() error {
|
||||||
|
|
||||||
verOffset := crcOffset - 4
|
verOffset := crcOffset - 4
|
||||||
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
|
s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
|
||||||
if s.version != version {
|
if s.version != Version {
|
||||||
return fmt.Errorf("unsupported version %d", s.version)
|
return fmt.Errorf("unsupported version %d", s.version)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -207,7 +211,7 @@ func (s *Segment) loadConfig() error {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SegmentBase) loadFields() error {
|
func (s *SegmentBase) loadFields() error {
|
||||||
// NOTE for now we assume the fields index immediately preceeds
|
// NOTE for now we assume the fields index immediately precedes
|
||||||
// the footer, and if this changes, need to adjust accordingly (or
|
// the footer, and if this changes, need to adjust accordingly (or
|
||||||
// store explicit length), where s.mem was sliced from s.mm in Open().
|
// store explicit length), where s.mem was sliced from s.mm in Open().
|
||||||
fieldsIndexEnd := uint64(len(s.mem))
|
fieldsIndexEnd := uint64(len(s.mem))
|
||||||
|
@ -262,6 +266,10 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
|
return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
|
||||||
}
|
}
|
||||||
|
rv.fstReader, err = rv.fst.Reader()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -269,50 +277,90 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) {
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// visitDocumentCtx bundles scratch state that can be reused from one
// VisitDocument() call to the next so that those calls do not have to
// allocate it afresh.
type visitDocumentCtx struct {
	buf      []byte       // destination buffer offered to snappy.Decode
	reader   bytes.Reader // reset onto a document's meta bytes
	arrayPos []uint64     // backing storage for decoded array positions
}

// visitDocumentCtxPool hands out reusable *visitDocumentCtx values; a
// fresh zero-valued context is created when the pool has none.
var visitDocumentCtxPool = sync.Pool{
	New: func() interface{} {
		return new(visitDocumentCtx)
	},
}
|
||||||
|
|
||||||
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
// VisitDocument invokes the DocFieldValueVistor for each stored field
|
||||||
// for the specified doc number
|
// for the specified doc number
|
||||||
func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
|
||||||
|
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
|
||||||
|
defer visitDocumentCtxPool.Put(vdc)
|
||||||
|
return s.visitDocument(vdc, num, visitor)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64,
|
||||||
|
visitor segment.DocumentFieldValueVisitor) error {
|
||||||
// first make sure this is a valid number in this segment
|
// first make sure this is a valid number in this segment
|
||||||
if num < s.numDocs {
|
if num < s.numDocs {
|
||||||
meta, compressed := s.getDocStoredMetaAndCompressed(num)
|
meta, compressed := s.getDocStoredMetaAndCompressed(num)
|
||||||
uncompressed, err := snappy.Decode(nil, compressed)
|
|
||||||
|
vdc.reader.Reset(meta)
|
||||||
|
|
||||||
|
// handle _id field special case
|
||||||
|
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
idFieldVal := compressed[:idFieldValLen]
|
||||||
|
|
||||||
|
keepGoing := visitor("_id", byte('t'), idFieldVal, nil)
|
||||||
|
if !keepGoing {
|
||||||
|
visitDocumentCtxPool.Put(vdc)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle non-"_id" fields
|
||||||
|
compressed = compressed[idFieldValLen:]
|
||||||
|
|
||||||
|
uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
// now decode meta and process
|
|
||||||
reader := bytes.NewReader(meta)
|
|
||||||
decoder := govarint.NewU64Base128Decoder(reader)
|
|
||||||
|
|
||||||
keepGoing := true
|
|
||||||
for keepGoing {
|
for keepGoing {
|
||||||
field, err := decoder.GetU64()
|
field, err := binary.ReadUvarint(&vdc.reader)
|
||||||
if err == io.EOF {
|
if err == io.EOF {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
typ, err := decoder.GetU64()
|
typ, err := binary.ReadUvarint(&vdc.reader)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
offset, err := decoder.GetU64()
|
offset, err := binary.ReadUvarint(&vdc.reader)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
l, err := decoder.GetU64()
|
l, err := binary.ReadUvarint(&vdc.reader)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
numap, err := decoder.GetU64()
|
numap, err := binary.ReadUvarint(&vdc.reader)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
var arrayPos []uint64
|
var arrayPos []uint64
|
||||||
if numap > 0 {
|
if numap > 0 {
|
||||||
arrayPos = make([]uint64, numap)
|
if cap(vdc.arrayPos) < int(numap) {
|
||||||
|
vdc.arrayPos = make([]uint64, numap)
|
||||||
|
}
|
||||||
|
arrayPos = vdc.arrayPos[:numap]
|
||||||
for i := 0; i < int(numap); i++ {
|
for i := 0; i < int(numap); i++ {
|
||||||
ap, err := decoder.GetU64()
|
ap, err := binary.ReadUvarint(&vdc.reader)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -323,10 +371,36 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal
|
||||||
value := uncompressed[offset : offset+l]
|
value := uncompressed[offset : offset+l]
|
||||||
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
|
keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vdc.buf = uncompressed
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DocID returns the value of the _id field for the given docNum
|
||||||
|
func (s *SegmentBase) DocID(num uint64) ([]byte, error) {
|
||||||
|
if num >= s.numDocs {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx)
|
||||||
|
|
||||||
|
meta, compressed := s.getDocStoredMetaAndCompressed(num)
|
||||||
|
|
||||||
|
vdc.reader.Reset(meta)
|
||||||
|
|
||||||
|
// handle _id field special case
|
||||||
|
idFieldValLen, err := binary.ReadUvarint(&vdc.reader)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
idFieldVal := compressed[:idFieldValLen]
|
||||||
|
|
||||||
|
visitDocumentCtxPool.Put(vdc)
|
||||||
|
|
||||||
|
return idFieldVal, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Count returns the number of documents in this segment.
|
// Count returns the number of documents in this segment.
|
||||||
func (s *SegmentBase) Count() uint64 {
|
func (s *SegmentBase) Count() uint64 {
|
||||||
return s.numDocs
|
return s.numDocs
|
||||||
|
@ -343,16 +417,27 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var postings *PostingsList
|
postingsList := emptyPostingsList
|
||||||
for _, id := range ids {
|
|
||||||
postings, err = idDict.postingsList([]byte(id), nil, postings)
|
sMax, err := idDict.fst.GetMaxKey()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
if postings.postings != nil {
|
sMaxStr := string(sMax)
|
||||||
rv.Or(postings.postings)
|
filteredIds := make([]string, 0, len(ids))
|
||||||
|
for _, id := range ids {
|
||||||
|
if id <= sMaxStr {
|
||||||
|
filteredIds = append(filteredIds, id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for _, id := range filteredIds {
|
||||||
|
postingsList, err = idDict.postingsList([]byte(id), nil, postingsList)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
postingsList.OrInto(rv)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return rv, nil
|
return rv, nil
|
||||||
|
@ -441,19 +526,32 @@ func (s *Segment) DictAddr(field string) (uint64, error) {
|
||||||
return s.dictLocs[fieldIDPlus1-1], nil
|
return s.dictLocs[fieldIDPlus1-1], nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SegmentBase) loadDvIterators() error {
|
func (s *SegmentBase) loadDvReaders() error {
|
||||||
if s.docValueOffset == fieldNotUninverted {
|
if s.docValueOffset == fieldNotUninverted {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
var read uint64
|
var read uint64
|
||||||
for fieldID, field := range s.fieldsInv {
|
for fieldID, field := range s.fieldsInv {
|
||||||
fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
|
var fieldLocStart, fieldLocEnd uint64
|
||||||
|
var n int
|
||||||
|
fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
|
||||||
if n <= 0 {
|
if n <= 0 {
|
||||||
return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID)
|
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID)
|
||||||
}
|
}
|
||||||
s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc)
|
|
||||||
read += uint64(n)
|
read += uint64(n)
|
||||||
|
fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64])
|
||||||
|
if n <= 0 {
|
||||||
|
return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID)
|
||||||
}
|
}
|
||||||
|
read += uint64(n)
|
||||||
|
|
||||||
|
fieldDvReader, _ := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd)
|
||||||
|
if fieldDvReader != nil {
|
||||||
|
s.fieldDvReaders[uint16(fieldID)] = fieldDvReader
|
||||||
|
s.fieldDvNames = append(s.fieldDvNames, field)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,6 @@
|
||||||
package zap
|
package zap
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"io"
|
"io"
|
||||||
|
|
||||||
|
@ -25,28 +24,29 @@ import (
|
||||||
// writes out the length of the roaring bitmap in bytes as varint
|
// writes out the length of the roaring bitmap in bytes as varint
|
||||||
// then writes out the roaring bitmap itself
|
// then writes out the roaring bitmap itself
|
||||||
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer,
|
func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer,
|
||||||
reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) {
|
reuseBufVarint []byte) (int, error) {
|
||||||
reuseBuf.Reset()
|
buf, err := r.ToBytes()
|
||||||
|
|
||||||
// write out postings list to memory so we know the len
|
|
||||||
postingsListLen, err := r.WriteTo(reuseBuf)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, err
|
return 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
var tw int
|
var tw int
|
||||||
// write out the length of this postings list
|
|
||||||
n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen))
|
// write out the length
|
||||||
|
n := binary.PutUvarint(reuseBufVarint, uint64(len(buf)))
|
||||||
nw, err := w.Write(reuseBufVarint[:n])
|
nw, err := w.Write(reuseBufVarint[:n])
|
||||||
tw += nw
|
tw += nw
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return tw, err
|
return tw, err
|
||||||
}
|
}
|
||||||
// write out the postings list itself
|
|
||||||
nw, err = w.Write(reuseBuf.Bytes())
|
// write out the roaring bytes
|
||||||
|
nw, err = w.Write(buf)
|
||||||
tw += nw
|
tw += nw
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return tw, err
|
return tw, err
|
||||||
}
|
}
|
||||||
|
|
||||||
return tw, nil
|
return tw, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -118,7 +118,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
// write out 32-bit version
|
// write out 32-bit version
|
||||||
err = binary.Write(w, binary.BigEndian, version)
|
err = binary.Write(w, binary.BigEndian, Version)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,10 +15,10 @@
|
||||||
package scorch
|
package scorch
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
|
||||||
"container/heap"
|
"container/heap"
|
||||||
"encoding/binary"
|
"encoding/binary"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
"sort"
|
"sort"
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
@ -27,8 +27,13 @@ import (
|
||||||
"github.com/blevesearch/bleve/document"
|
"github.com/blevesearch/bleve/document"
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
|
"github.com/couchbase/vellum"
|
||||||
|
lev2 "github.com/couchbase/vellum/levenshtein2"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// reusable, thread-safe Levenshtein automaton builders
|
||||||
|
var lb1, lb2 *lev2.LevenshteinAutomatonBuilder
|
||||||
|
|
||||||
type asynchSegmentResult struct {
|
type asynchSegmentResult struct {
|
||||||
dictItr segment.DictionaryIterator
|
dictItr segment.DictionaryIterator
|
||||||
|
|
||||||
|
@ -40,15 +45,36 @@ type asynchSegmentResult struct {
|
||||||
err error
|
err error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var reflectStaticSizeIndexSnapshot int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var is interface{} = IndexSnapshot{}
|
||||||
|
reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size())
|
||||||
|
var err error
|
||||||
|
lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err))
|
||||||
|
}
|
||||||
|
lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true)
|
||||||
|
if err != nil {
|
||||||
|
panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
type IndexSnapshot struct {
|
type IndexSnapshot struct {
|
||||||
parent *Scorch
|
parent *Scorch
|
||||||
segment []*SegmentSnapshot
|
segment []*SegmentSnapshot
|
||||||
offsets []uint64
|
offsets []uint64
|
||||||
internal map[string][]byte
|
internal map[string][]byte
|
||||||
epoch uint64
|
epoch uint64
|
||||||
|
size uint64
|
||||||
|
creator string
|
||||||
|
|
||||||
m sync.Mutex // Protects the fields that follow.
|
m sync.Mutex // Protects the fields that follow.
|
||||||
refs int64
|
refs int64
|
||||||
|
|
||||||
|
m2 sync.Mutex // Protects the fields that follow.
|
||||||
|
fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's
|
||||||
}
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshot) Segments() []*SegmentSnapshot {
|
func (i *IndexSnapshot) Segments() []*SegmentSnapshot {
|
||||||
|
@ -85,12 +111,27 @@ func (i *IndexSnapshot) DecRef() (err error) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) Close() error {
|
||||||
|
return i.DecRef()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) Size() int {
|
||||||
|
return int(i.size)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) updateSize() {
|
||||||
|
i.size += uint64(reflectStaticSizeIndexSnapshot)
|
||||||
|
for _, s := range i.segment {
|
||||||
|
i.size += uint64(s.Size())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) {
|
func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) {
|
||||||
|
|
||||||
results := make(chan *asynchSegmentResult)
|
results := make(chan *asynchSegmentResult)
|
||||||
for index, segment := range i.segment {
|
for index, segment := range i.segment {
|
||||||
go func(index int, segment *SegmentSnapshot) {
|
go func(index int, segment *SegmentSnapshot) {
|
||||||
dict, err := segment.Dictionary(field)
|
dict, err := segment.segment.Dictionary(field)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
results <- &asynchSegmentResult{err: err}
|
results <- &asynchSegmentResult{err: err}
|
||||||
} else {
|
} else {
|
||||||
|
@ -116,7 +157,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s
|
||||||
if next != nil {
|
if next != nil {
|
||||||
rv.cursors = append(rv.cursors, &segmentDictCursor{
|
rv.cursors = append(rv.cursors, &segmentDictCursor{
|
||||||
itr: asr.dictItr,
|
itr: asr.dictItr,
|
||||||
curr: next,
|
curr: *next,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -151,6 +192,56 @@ func (i *IndexSnapshot) FieldDictPrefix(field string,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) FieldDictRegexp(field string,
|
||||||
|
termRegex string) (index.FieldDict, error) {
|
||||||
|
// TODO: potential optimization where the literal prefix represents the
|
||||||
|
// entire regexp, allowing us to use PrefixIterator(prefixTerm)?
|
||||||
|
|
||||||
|
a, prefixBeg, prefixEnd, err := segment.ParseRegexp(termRegex)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||||
|
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) getLevAutomaton(term string,
|
||||||
|
fuzziness uint8) (vellum.Automaton, error) {
|
||||||
|
if fuzziness == 1 {
|
||||||
|
return lb1.BuildDfa(term, fuzziness)
|
||||||
|
} else if fuzziness == 2 {
|
||||||
|
return lb2.BuildDfa(term, fuzziness)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("fuzziness exceeds the max limit")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) FieldDictFuzzy(field string,
|
||||||
|
term string, fuzziness int, prefix string) (index.FieldDict, error) {
|
||||||
|
a, err := i.getLevAutomaton(term, uint8(fuzziness))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var prefixBeg, prefixEnd []byte
|
||||||
|
if prefix != "" {
|
||||||
|
prefixBeg = []byte(prefix)
|
||||||
|
prefixEnd = segment.IncrementBytes(prefixBeg)
|
||||||
|
}
|
||||||
|
|
||||||
|
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||||
|
return i.AutomatonIterator(a, prefixBeg, prefixEnd)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) FieldDictOnly(field string,
|
||||||
|
onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) {
|
||||||
|
return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator {
|
||||||
|
return i.OnlyIterator(onlyTerms, includeCount)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
|
func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) {
|
||||||
results := make(chan *asynchSegmentResult)
|
results := make(chan *asynchSegmentResult)
|
||||||
for index, segment := range i.segment {
|
for index, segment := range i.segment {
|
||||||
|
@ -264,21 +355,26 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) {
|
||||||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||||
|
|
||||||
rv = document.NewDocument(id)
|
rv = document.NewDocument(id)
|
||||||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool {
|
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, val []byte, pos []uint64) bool {
|
||||||
if name == "_id" {
|
if name == "_id" {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// copy value, array positions to preserve them beyond the scope of this callback
|
||||||
|
value := append([]byte(nil), val...)
|
||||||
|
arrayPos := append([]uint64(nil), pos...)
|
||||||
|
|
||||||
switch typ {
|
switch typ {
|
||||||
case 't':
|
case 't':
|
||||||
rv.AddField(document.NewTextField(name, pos, value))
|
rv.AddField(document.NewTextField(name, arrayPos, value))
|
||||||
case 'n':
|
case 'n':
|
||||||
rv.AddField(document.NewNumericFieldFromBytes(name, pos, value))
|
rv.AddField(document.NewNumericFieldFromBytes(name, arrayPos, value))
|
||||||
case 'd':
|
case 'd':
|
||||||
rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value))
|
rv.AddField(document.NewDateTimeFieldFromBytes(name, arrayPos, value))
|
||||||
case 'b':
|
case 'b':
|
||||||
rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value))
|
rv.AddField(document.NewBooleanFieldFromBytes(name, arrayPos, value))
|
||||||
case 'g':
|
case 'g':
|
||||||
rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value))
|
rv.AddField(document.NewGeoPointFieldFromBytes(name, arrayPos, value))
|
||||||
}
|
}
|
||||||
|
|
||||||
return true
|
return true
|
||||||
|
@ -307,24 +403,15 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
|
||||||
}
|
}
|
||||||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||||
|
|
||||||
var found bool
|
v, err := i.segment[segmentIndex].DocID(localDocNum)
|
||||||
var rv string
|
|
||||||
err = i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool {
|
|
||||||
if field == "_id" {
|
|
||||||
found = true
|
|
||||||
rv = string(value)
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
})
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
}
|
}
|
||||||
|
if v == nil {
|
||||||
if found {
|
|
||||||
return rv, nil
|
|
||||||
}
|
|
||||||
return "", fmt.Errorf("document number %d not found", docNum)
|
return "", fmt.Errorf("document number %d not found", docNum)
|
||||||
|
}
|
||||||
|
|
||||||
|
return string(v), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) {
|
func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) {
|
||||||
|
@ -349,33 +436,81 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err
|
||||||
|
|
||||||
func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
|
func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq,
|
||||||
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
|
includeNorm, includeTermVectors bool) (index.TermFieldReader, error) {
|
||||||
|
rv := i.allocTermFieldReaderDicts(field)
|
||||||
|
|
||||||
rv := &IndexSnapshotTermFieldReader{
|
rv.term = term
|
||||||
term: term,
|
rv.field = field
|
||||||
field: field,
|
rv.snapshot = i
|
||||||
snapshot: i,
|
if rv.postings == nil {
|
||||||
postings: make([]segment.PostingsList, len(i.segment)),
|
rv.postings = make([]segment.PostingsList, len(i.segment))
|
||||||
iterators: make([]segment.PostingsIterator, len(i.segment)),
|
|
||||||
includeFreq: includeFreq,
|
|
||||||
includeNorm: includeNorm,
|
|
||||||
includeTermVectors: includeTermVectors,
|
|
||||||
}
|
}
|
||||||
|
if rv.iterators == nil {
|
||||||
|
rv.iterators = make([]segment.PostingsIterator, len(i.segment))
|
||||||
|
}
|
||||||
|
rv.segmentOffset = 0
|
||||||
|
rv.includeFreq = includeFreq
|
||||||
|
rv.includeNorm = includeNorm
|
||||||
|
rv.includeTermVectors = includeTermVectors
|
||||||
|
rv.currPosting = nil
|
||||||
|
rv.currID = rv.currID[:0]
|
||||||
|
|
||||||
|
if rv.dicts == nil {
|
||||||
|
rv.dicts = make([]segment.TermDictionary, len(i.segment))
|
||||||
for i, segment := range i.segment {
|
for i, segment := range i.segment {
|
||||||
dict, err := segment.Dictionary(field)
|
dict, err := segment.segment.Dictionary(field)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
pl, err := dict.PostingsList(string(term), nil)
|
rv.dicts[i] = dict
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, segment := range i.segment {
|
||||||
|
pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i])
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
rv.postings[i] = pl
|
rv.postings[i] = pl
|
||||||
rv.iterators[i] = pl.Iterator()
|
rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i])
|
||||||
}
|
}
|
||||||
atomic.AddUint64(&i.parent.stats.termSearchersStarted, uint64(1))
|
atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1))
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnapshotTermFieldReader) {
|
||||||
|
i.m2.Lock()
|
||||||
|
if i.fieldTFRs != nil {
|
||||||
|
tfrs := i.fieldTFRs[field]
|
||||||
|
last := len(tfrs) - 1
|
||||||
|
if last >= 0 {
|
||||||
|
tfr = tfrs[last]
|
||||||
|
tfrs[last] = nil
|
||||||
|
i.fieldTFRs[field] = tfrs[:last]
|
||||||
|
i.m2.Unlock()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i.m2.Unlock()
|
||||||
|
return &IndexSnapshotTermFieldReader{}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) {
|
||||||
|
i.parent.rootLock.RLock()
|
||||||
|
obsolete := i.parent.root != i
|
||||||
|
i.parent.rootLock.RUnlock()
|
||||||
|
if obsolete {
|
||||||
|
// if we're not the current root (mutations happened), don't bother recycling
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
i.m2.Lock()
|
||||||
|
if i.fieldTFRs == nil {
|
||||||
|
i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{}
|
||||||
|
}
|
||||||
|
i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr)
|
||||||
|
i.m2.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
func docNumberToBytes(buf []byte, in uint64) []byte {
|
func docNumberToBytes(buf []byte, in uint64) []byte {
|
||||||
if len(buf) != 8 {
|
if len(buf) != 8 {
|
||||||
if cap(buf) >= 8 {
|
if cap(buf) >= 8 {
|
||||||
|
@ -389,115 +524,172 @@ func docNumberToBytes(buf []byte, in uint64) []byte {
|
||||||
}
|
}
|
||||||
|
|
||||||
func docInternalToNumber(in index.IndexInternalID) (uint64, error) {
|
func docInternalToNumber(in index.IndexInternalID) (uint64, error) {
|
||||||
var res uint64
|
if len(in) != 8 {
|
||||||
err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res)
|
return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in)
|
||||||
if err != nil {
|
|
||||||
return 0, err
|
|
||||||
}
|
}
|
||||||
return res, nil
|
return binary.BigEndian.Uint64(in), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID,
|
func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID,
|
||||||
fields []string, visitor index.DocumentFieldTermVisitor) error {
|
fields []string, visitor index.DocumentFieldTermVisitor) error {
|
||||||
|
_, err := i.documentVisitFieldTerms(id, fields, visitor, nil)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID,
|
||||||
|
fields []string, visitor index.DocumentFieldTermVisitor,
|
||||||
|
dvs segment.DocVisitState) (segment.DocVisitState, error) {
|
||||||
|
docNum, err := docInternalToNumber(id)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||||
|
if segmentIndex >= len(i.segment) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
_, dvs, err = i.documentVisitFieldTermsOnSegment(
|
||||||
|
segmentIndex, localDocNum, fields, nil, visitor, dvs)
|
||||||
|
|
||||||
|
return dvs, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) documentVisitFieldTermsOnSegment(
|
||||||
|
segmentIndex int, localDocNum uint64, fields []string, cFields []string,
|
||||||
|
visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) (
|
||||||
|
cFieldsOut []string, dvsOut segment.DocVisitState, err error) {
|
||||||
|
ss := i.segment[segmentIndex]
|
||||||
|
|
||||||
|
var vFields []string // fields that are visitable via the segment
|
||||||
|
|
||||||
|
ssv, ssvOk := ss.segment.(segment.DocumentFieldTermVisitable)
|
||||||
|
if ssvOk && ssv != nil {
|
||||||
|
vFields, err = ssv.VisitableDocValueFields()
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var errCh chan error
|
||||||
|
|
||||||
|
// cFields represents the fields that we'll need from the
|
||||||
|
// cachedDocs, and might optionally be provided by the caller,
|
||||||
|
// if the caller happens to know we're on the same segmentIndex
|
||||||
|
// from a previous invocation
|
||||||
|
if cFields == nil {
|
||||||
|
cFields = subtractStrings(fields, vFields)
|
||||||
|
|
||||||
|
if !ss.cachedDocs.hasFields(cFields) {
|
||||||
|
errCh = make(chan error, 1)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
err := ss.cachedDocs.prepareFields(cFields, ss)
|
||||||
|
if err != nil {
|
||||||
|
errCh <- err
|
||||||
|
}
|
||||||
|
close(errCh)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ssvOk && ssv != nil && len(vFields) > 0 {
|
||||||
|
dvs, err = ssv.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs)
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if errCh != nil {
|
||||||
|
err = <-errCh
|
||||||
|
if err != nil {
|
||||||
|
return nil, nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(cFields) > 0 {
|
||||||
|
ss.cachedDocs.visitDoc(localDocNum, cFields, visitor)
|
||||||
|
}
|
||||||
|
|
||||||
|
return cFields, dvs, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) DocValueReader(fields []string) (
|
||||||
|
index.DocValueReader, error) {
|
||||||
|
return &DocValueReader{i: i, fields: fields, currSegmentIndex: -1}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type DocValueReader struct {
|
||||||
|
i *IndexSnapshot
|
||||||
|
fields []string
|
||||||
|
dvs segment.DocVisitState
|
||||||
|
|
||||||
|
currSegmentIndex int
|
||||||
|
currCachedFields []string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
|
||||||
|
visitor index.DocumentFieldTermVisitor) (err error) {
|
||||||
docNum, err := docInternalToNumber(id)
|
docNum, err := docInternalToNumber(id)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
|
||||||
if segmentIndex >= len(i.segment) {
|
segmentIndex, localDocNum := dvr.i.segmentIndexAndLocalDocNumFromGlobal(docNum)
|
||||||
|
if segmentIndex >= len(dvr.i.segment) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
ss := i.segment[segmentIndex]
|
if dvr.currSegmentIndex != segmentIndex {
|
||||||
|
dvr.currSegmentIndex = segmentIndex
|
||||||
|
dvr.currCachedFields = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
dvr.currCachedFields, dvr.dvs, err = dvr.i.documentVisitFieldTermsOnSegment(
|
||||||
|
dvr.currSegmentIndex, localDocNum, dvr.fields, dvr.currCachedFields, visitor, dvr.dvs)
|
||||||
|
|
||||||
if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok {
|
|
||||||
// get the list of doc value persisted fields
|
|
||||||
pFields, err := zaps.VisitableDocValueFields()
|
|
||||||
if err != nil {
|
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
// sort out the fields for which term lookups have to
|
|
||||||
// be performed at runtime
|
|
||||||
dvPendingFields := extractDvPendingFields(fields, pFields)
|
|
||||||
if len(dvPendingFields) == 0 {
|
|
||||||
// all fields are doc value persisted
|
|
||||||
return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
|
|
||||||
}
|
|
||||||
|
|
||||||
// concurrently trigger the runtime doc value preparations for
|
|
||||||
// pending fields as well as the visit of the persisted doc values
|
|
||||||
errCh := make(chan error, 1)
|
|
||||||
|
|
||||||
|
func (i *IndexSnapshot) DumpAll() chan interface{} {
|
||||||
|
rv := make(chan interface{})
|
||||||
go func() {
|
go func() {
|
||||||
defer close(errCh)
|
close(rv)
|
||||||
err := ss.cachedDocs.prepareFields(fields, ss)
|
|
||||||
if err != nil {
|
|
||||||
errCh <- err
|
|
||||||
}
|
|
||||||
}()
|
}()
|
||||||
|
return rv
|
||||||
// visit the persisted dv while the cache preparation is in progress
|
|
||||||
err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// err out if fieldCache preparation failed
|
|
||||||
err = <-errCh
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string,
|
func (i *IndexSnapshot) DumpDoc(id string) chan interface{} {
|
||||||
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error {
|
rv := make(chan interface{})
|
||||||
err := ss.cachedDocs.prepareFields(fields, ss)
|
go func() {
|
||||||
if err != nil {
|
close(rv)
|
||||||
return err
|
}()
|
||||||
}
|
return rv
|
||||||
|
|
||||||
visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor)
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string,
|
func (i *IndexSnapshot) DumpFields() chan interface{} {
|
||||||
ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) {
|
rv := make(chan interface{})
|
||||||
|
go func() {
|
||||||
for _, field := range fields {
|
close(rv)
|
||||||
if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists {
|
}()
|
||||||
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
|
return rv
|
||||||
for {
|
|
||||||
i := bytes.Index(tlist, TermSeparatorSplitSlice)
|
|
||||||
if i < 0 {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
visitor(field, tlist[0:i])
|
|
||||||
tlist = tlist[i+1:]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func extractDvPendingFields(requestedFields, persistedFields []string) []string {
|
// subtractStrings returns set a minus elements of set b.
|
||||||
removeMap := map[string]struct{}{}
|
func subtractStrings(a, b []string) []string {
|
||||||
for _, str := range persistedFields {
|
if len(b) == 0 {
|
||||||
removeMap[str] = struct{}{}
|
return a
|
||||||
}
|
}
|
||||||
|
|
||||||
rv := make([]string, 0, len(requestedFields))
|
rv := make([]string, 0, len(a))
|
||||||
for _, s := range requestedFields {
|
OUTER:
|
||||||
if _, ok := removeMap[s]; !ok {
|
for _, as := range a {
|
||||||
rv = append(rv, s)
|
for _, bs := range b {
|
||||||
|
if as == bs {
|
||||||
|
continue OUTER
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
rv = append(rv, as)
|
||||||
|
}
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,12 +23,13 @@ import (
|
||||||
|
|
||||||
type segmentDictCursor struct {
|
type segmentDictCursor struct {
|
||||||
itr segment.DictionaryIterator
|
itr segment.DictionaryIterator
|
||||||
curr *index.DictEntry
|
curr index.DictEntry
|
||||||
}
|
}
|
||||||
|
|
||||||
type IndexSnapshotFieldDict struct {
|
type IndexSnapshotFieldDict struct {
|
||||||
snapshot *IndexSnapshot
|
snapshot *IndexSnapshot
|
||||||
cursors []*segmentDictCursor
|
cursors []*segmentDictCursor
|
||||||
|
entry index.DictEntry
|
||||||
}
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) }
|
func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) }
|
||||||
|
@ -51,10 +52,10 @@ func (i *IndexSnapshotFieldDict) Pop() interface{} {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
||||||
if len(i.cursors) <= 0 {
|
if len(i.cursors) == 0 {
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
rv := i.cursors[0].curr
|
i.entry = i.cursors[0].curr
|
||||||
next, err := i.cursors[0].itr.Next()
|
next, err := i.cursors[0].itr.Next()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -64,12 +65,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
||||||
heap.Pop(i)
|
heap.Pop(i)
|
||||||
} else {
|
} else {
|
||||||
// modified heap, fix it
|
// modified heap, fix it
|
||||||
i.cursors[0].curr = next
|
i.cursors[0].curr = *next
|
||||||
heap.Fix(i, 0)
|
heap.Fix(i, 0)
|
||||||
}
|
}
|
||||||
// look for any other entries with the exact same term
|
// look for any other entries with the exact same term
|
||||||
for len(i.cursors) > 0 && i.cursors[0].curr.Term == rv.Term {
|
for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term {
|
||||||
rv.Count += i.cursors[0].curr.Count
|
i.entry.Count += i.cursors[0].curr.Count
|
||||||
next, err := i.cursors[0].itr.Next()
|
next, err := i.cursors[0].itr.Next()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -79,12 +80,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) {
|
||||||
heap.Pop(i)
|
heap.Pop(i)
|
||||||
} else {
|
} else {
|
||||||
// modified heap, fix it
|
// modified heap, fix it
|
||||||
i.cursors[0].curr = next
|
i.cursors[0].curr = *next
|
||||||
heap.Fix(i, 0)
|
heap.Fix(i, 0)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return rv, nil
|
return &i.entry, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshotFieldDict) Close() error {
|
func (i *IndexSnapshotFieldDict) Close() error {
|
||||||
|
|
|
@ -16,17 +16,30 @@ package scorch
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeIndexSnapshotDocIDReader int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var isdr IndexSnapshotDocIDReader
|
||||||
|
reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(isdr).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type IndexSnapshotDocIDReader struct {
|
type IndexSnapshotDocIDReader struct {
|
||||||
snapshot *IndexSnapshot
|
snapshot *IndexSnapshot
|
||||||
iterators []roaring.IntIterable
|
iterators []roaring.IntIterable
|
||||||
segmentOffset int
|
segmentOffset int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshotDocIDReader) Size() int {
|
||||||
|
return reflectStaticSizeIndexSnapshotDocIDReader + size.SizeOfPtr
|
||||||
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) {
|
func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) {
|
||||||
for i.segmentOffset < len(i.iterators) {
|
for i.segmentOffset < len(i.iterators) {
|
||||||
if !i.iterators[i.segmentOffset].HasNext() {
|
if !i.iterators[i.segmentOffset].HasNext() {
|
||||||
|
|
|
@ -16,16 +16,27 @@ package scorch
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"reflect"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeIndexSnapshotTermFieldReader int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var istfr IndexSnapshotTermFieldReader
|
||||||
|
reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(istfr).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type IndexSnapshotTermFieldReader struct {
|
type IndexSnapshotTermFieldReader struct {
|
||||||
term []byte
|
term []byte
|
||||||
field string
|
field string
|
||||||
snapshot *IndexSnapshot
|
snapshot *IndexSnapshot
|
||||||
|
dicts []segment.TermDictionary
|
||||||
postings []segment.PostingsList
|
postings []segment.PostingsList
|
||||||
iterators []segment.PostingsIterator
|
iterators []segment.PostingsIterator
|
||||||
segmentOffset int
|
segmentOffset int
|
||||||
|
@ -36,13 +47,34 @@ type IndexSnapshotTermFieldReader struct {
|
||||||
currID index.IndexInternalID
|
currID index.IndexInternalID
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (i *IndexSnapshotTermFieldReader) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr +
|
||||||
|
len(i.term) +
|
||||||
|
len(i.field) +
|
||||||
|
len(i.currID)
|
||||||
|
|
||||||
|
for _, entry := range i.postings {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range i.iterators {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
if i.currPosting != nil {
|
||||||
|
sizeInBytes += i.currPosting.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
|
func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) {
|
||||||
rv := preAlloced
|
rv := preAlloced
|
||||||
if rv == nil {
|
if rv == nil {
|
||||||
rv = &index.TermFieldDoc{}
|
rv = &index.TermFieldDoc{}
|
||||||
}
|
}
|
||||||
// find the next hit
|
// find the next hit
|
||||||
for i.segmentOffset < len(i.postings) {
|
for i.segmentOffset < len(i.iterators) {
|
||||||
next, err := i.iterators[i.segmentOffset].Next()
|
next, err := i.iterators[i.segmentOffset].Next()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -72,9 +104,16 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin
|
||||||
}
|
}
|
||||||
if i.includeTermVectors {
|
if i.includeTermVectors {
|
||||||
locs := next.Locations()
|
locs := next.Locations()
|
||||||
|
if cap(rv.Vectors) < len(locs) {
|
||||||
rv.Vectors = make([]*index.TermFieldVector, len(locs))
|
rv.Vectors = make([]*index.TermFieldVector, len(locs))
|
||||||
|
backing := make([]index.TermFieldVector, len(locs))
|
||||||
|
for i := range backing {
|
||||||
|
rv.Vectors[i] = &backing[i]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
rv.Vectors = rv.Vectors[:len(locs)]
|
||||||
for i, loc := range locs {
|
for i, loc := range locs {
|
||||||
rv.Vectors[i] = &index.TermFieldVector{
|
*rv.Vectors[i] = index.TermFieldVector{
|
||||||
Start: loc.Start(),
|
Start: loc.Start(),
|
||||||
End: loc.End(),
|
End: loc.End(),
|
||||||
Pos: loc.Pos(),
|
Pos: loc.Pos(),
|
||||||
|
@ -96,24 +135,37 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo
|
||||||
}
|
}
|
||||||
*i = *(i2.(*IndexSnapshotTermFieldReader))
|
*i = *(i2.(*IndexSnapshotTermFieldReader))
|
||||||
}
|
}
|
||||||
// FIXME do something better
|
num, err := docInternalToNumber(ID)
|
||||||
next, err := i.Next(preAlloced)
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err)
|
||||||
|
}
|
||||||
|
segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num)
|
||||||
|
if segIndex >= len(i.snapshot.segment) {
|
||||||
|
return nil, fmt.Errorf("computed segment index %d out of bounds %d",
|
||||||
|
segIndex, len(i.snapshot.segment))
|
||||||
|
}
|
||||||
|
// skip directly to the target segment
|
||||||
|
i.segmentOffset = segIndex
|
||||||
|
next, err := i.iterators[i.segmentOffset].Advance(ldocNum)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
if next == nil {
|
if next == nil {
|
||||||
return nil, nil
|
// we jumped directly to the segment that should have contained it
|
||||||
|
// but it wasn't there, so reuse Next() which should correctly
|
||||||
|
// get the next hit after it (we moved i.segmentOffset)
|
||||||
|
return i.Next(preAlloced)
|
||||||
}
|
}
|
||||||
for bytes.Compare(next.ID, ID) < 0 {
|
|
||||||
next, err = i.Next(preAlloced)
|
if preAlloced == nil {
|
||||||
if err != nil {
|
preAlloced = &index.TermFieldDoc{}
|
||||||
return nil, err
|
|
||||||
}
|
}
|
||||||
if next == nil {
|
preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+
|
||||||
break
|
i.snapshot.offsets[segIndex])
|
||||||
}
|
i.postingToTermFieldDoc(next, preAlloced)
|
||||||
}
|
i.currID = preAlloced.ID
|
||||||
return next, nil
|
i.currPosting = next
|
||||||
|
return preAlloced, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (i *IndexSnapshotTermFieldReader) Count() uint64 {
|
func (i *IndexSnapshotTermFieldReader) Count() uint64 {
|
||||||
|
@ -126,7 +178,8 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 {
|
||||||
|
|
||||||
func (i *IndexSnapshotTermFieldReader) Close() error {
|
func (i *IndexSnapshotTermFieldReader) Close() error {
|
||||||
if i.snapshot != nil {
|
if i.snapshot != nil {
|
||||||
atomic.AddUint64(&i.snapshot.parent.stats.termSearchersFinished, uint64(1))
|
atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1))
|
||||||
|
i.snapshot.recycleTermFieldReader(i)
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,7 +19,7 @@ import (
|
||||||
"log"
|
"log"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
"github.com/boltdb/bolt"
|
bolt "github.com/etcd-io/bbolt"
|
||||||
)
|
)
|
||||||
|
|
||||||
type RollbackPoint struct {
|
type RollbackPoint struct {
|
||||||
|
|
|
@ -15,42 +15,25 @@
|
||||||
package scorch
|
package scorch
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"sync"
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
|
||||||
"github.com/RoaringBitmap/roaring"
|
"github.com/RoaringBitmap/roaring"
|
||||||
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/scorch/segment"
|
"github.com/blevesearch/bleve/index/scorch/segment"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
var TermSeparator byte = 0xff
|
var TermSeparator byte = 0xff
|
||||||
|
|
||||||
var TermSeparatorSplitSlice = []byte{TermSeparator}
|
var TermSeparatorSplitSlice = []byte{TermSeparator}
|
||||||
|
|
||||||
type SegmentDictionarySnapshot struct {
|
|
||||||
s *SegmentSnapshot
|
|
||||||
d segment.TermDictionary
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) {
|
|
||||||
// TODO: if except is non-nil, perhaps need to OR it with s.s.deleted?
|
|
||||||
return s.d.PostingsList(term, s.s.deleted)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator {
|
|
||||||
return s.d.Iterator()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator {
|
|
||||||
return s.d.PrefixIterator(prefix)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator {
|
|
||||||
return s.d.RangeIterator(start, end)
|
|
||||||
}
|
|
||||||
|
|
||||||
type SegmentSnapshot struct {
|
type SegmentSnapshot struct {
|
||||||
id uint64
|
id uint64
|
||||||
segment segment.Segment
|
segment segment.Segment
|
||||||
deleted *roaring.Bitmap
|
deleted *roaring.Bitmap
|
||||||
|
creator string
|
||||||
|
|
||||||
cachedDocs *cachedDocs
|
cachedDocs *cachedDocs
|
||||||
}
|
}
|
||||||
|
@ -83,8 +66,11 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel
|
||||||
return s.segment.VisitDocument(num, visitor)
|
return s.segment.VisitDocument(num, visitor)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SegmentSnapshot) Count() uint64 {
|
func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) {
|
||||||
|
return s.segment.DocID(num)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *SegmentSnapshot) Count() uint64 {
|
||||||
rv := s.segment.Count()
|
rv := s.segment.Count()
|
||||||
if s.deleted != nil {
|
if s.deleted != nil {
|
||||||
rv -= s.deleted.GetCardinality()
|
rv -= s.deleted.GetCardinality()
|
||||||
|
@ -92,17 +78,6 @@ func (s *SegmentSnapshot) Count() uint64 {
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) {
|
|
||||||
d, err := s.segment.Dictionary(field)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return &SegmentDictionarySnapshot{
|
|
||||||
s: s,
|
|
||||||
d: d,
|
|
||||||
}, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
|
func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
|
||||||
rv, err := s.segment.DocNumbers(docIDs)
|
rv, err := s.segment.DocNumbers(docIDs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -114,7 +89,7 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) {
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// DocNumbersLive returns bitsit containing doc numbers for all live docs
|
// DocNumbersLive returns a bitmap containing doc numbers for all live docs
|
||||||
func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap {
|
func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap {
|
||||||
rv := roaring.NewBitmap()
|
rv := roaring.NewBitmap()
|
||||||
rv.AddRange(0, s.segment.Count())
|
rv.AddRange(0, s.segment.Count())
|
||||||
|
@ -128,36 +103,68 @@ func (s *SegmentSnapshot) Fields() []string {
|
||||||
return s.segment.Fields()
|
return s.segment.Fields()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *SegmentSnapshot) Size() (rv int) {
|
||||||
|
rv = s.segment.Size()
|
||||||
|
if s.deleted != nil {
|
||||||
|
rv += int(s.deleted.GetSizeInBytes())
|
||||||
|
}
|
||||||
|
rv += s.cachedDocs.Size()
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
type cachedFieldDocs struct {
|
type cachedFieldDocs struct {
|
||||||
|
m sync.Mutex
|
||||||
readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used.
|
readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used.
|
||||||
err error // Non-nil if there was an error when preparing this cachedFieldDocs.
|
err error // Non-nil if there was an error when preparing this cachedFieldDocs.
|
||||||
docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF.
|
docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF.
|
||||||
|
size uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) {
|
func (cfd *cachedFieldDocs) Size() int {
|
||||||
defer close(cfd.readyCh)
|
var rv int
|
||||||
|
cfd.m.Lock()
|
||||||
|
for _, entry := range cfd.docs {
|
||||||
|
rv += 8 /* size of uint64 */ + len(entry)
|
||||||
|
}
|
||||||
|
cfd.m.Unlock()
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) {
|
||||||
|
cfd.m.Lock()
|
||||||
|
defer func() {
|
||||||
|
close(cfd.readyCh)
|
||||||
|
cfd.m.Unlock()
|
||||||
|
}()
|
||||||
|
|
||||||
|
cfd.size += uint64(size.SizeOfUint64) /* size field */
|
||||||
dict, err := ss.segment.Dictionary(field)
|
dict, err := ss.segment.Dictionary(field)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
cfd.err = err
|
cfd.err = err
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var postings segment.PostingsList
|
||||||
|
var postingsItr segment.PostingsIterator
|
||||||
|
|
||||||
dictItr := dict.Iterator()
|
dictItr := dict.Iterator()
|
||||||
next, err := dictItr.Next()
|
next, err := dictItr.Next()
|
||||||
for err == nil && next != nil {
|
for err == nil && next != nil {
|
||||||
postings, err1 := dict.PostingsList(next.Term, nil)
|
var err1 error
|
||||||
|
postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings)
|
||||||
if err1 != nil {
|
if err1 != nil {
|
||||||
cfd.err = err1
|
cfd.err = err1
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
postingsItr := postings.Iterator()
|
cfd.size += uint64(size.SizeOfUint64) /* map key */
|
||||||
|
postingsItr = postings.Iterator(false, false, false, postingsItr)
|
||||||
nextPosting, err2 := postingsItr.Next()
|
nextPosting, err2 := postingsItr.Next()
|
||||||
for err2 == nil && nextPosting != nil {
|
for err2 == nil && nextPosting != nil {
|
||||||
docNum := nextPosting.Number()
|
docNum := nextPosting.Number()
|
||||||
cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...)
|
cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...)
|
||||||
cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator)
|
cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator)
|
||||||
|
cfd.size += uint64(len(next.Term) + 1) // map value
|
||||||
nextPosting, err2 = postingsItr.Next()
|
nextPosting, err2 = postingsItr.Next()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -178,10 +185,12 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) {
|
||||||
type cachedDocs struct {
|
type cachedDocs struct {
|
||||||
m sync.Mutex // As the cache is asynchronously prepared, need a lock
|
m sync.Mutex // As the cache is asynchronously prepared, need a lock
|
||||||
cache map[string]*cachedFieldDocs // Keyed by field
|
cache map[string]*cachedFieldDocs // Keyed by field
|
||||||
|
size uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error {
|
func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error {
|
||||||
c.m.Lock()
|
c.m.Lock()
|
||||||
|
|
||||||
if c.cache == nil {
|
if c.cache == nil {
|
||||||
c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields()))
|
c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields()))
|
||||||
}
|
}
|
||||||
|
@ -194,7 +203,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e
|
||||||
docs: make(map[uint64][]byte),
|
docs: make(map[uint64][]byte),
|
||||||
}
|
}
|
||||||
|
|
||||||
go c.cache[field].prepareFields(field, ss)
|
go c.cache[field].prepareField(field, ss)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -209,21 +218,62 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e
|
||||||
c.m.Lock()
|
c.m.Lock()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
c.updateSizeLOCKED()
|
||||||
|
|
||||||
c.m.Unlock()
|
c.m.Unlock()
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *cachedDocs) sizeInBytes() uint64 {
|
// hasFields returns true if the cache has all the given fields
|
||||||
sizeInBytes := 0
|
func (c *cachedDocs) hasFields(fields []string) bool {
|
||||||
c.m.Lock()
|
c.m.Lock()
|
||||||
for k, v := range c.cache { // cachedFieldDocs
|
for _, field := range fields {
|
||||||
sizeInBytes += len(k)
|
if _, exists := c.cache[field]; !exists {
|
||||||
if v != nil {
|
c.m.Unlock()
|
||||||
for _, entry := range v.docs { // docs
|
return false // found a field not in cache
|
||||||
sizeInBytes += 8 /* size of uint64 */ + len(entry)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
c.m.Unlock()
|
c.m.Unlock()
|
||||||
return uint64(sizeInBytes)
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *cachedDocs) Size() int {
|
||||||
|
return int(atomic.LoadUint64(&c.size))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *cachedDocs) updateSizeLOCKED() {
|
||||||
|
sizeInBytes := 0
|
||||||
|
for k, v := range c.cache { // cachedFieldDocs
|
||||||
|
sizeInBytes += len(k)
|
||||||
|
if v != nil {
|
||||||
|
sizeInBytes += v.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
atomic.StoreUint64(&c.size, uint64(sizeInBytes))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *cachedDocs) visitDoc(localDocNum uint64,
|
||||||
|
fields []string, visitor index.DocumentFieldTermVisitor) {
|
||||||
|
c.m.Lock()
|
||||||
|
|
||||||
|
for _, field := range fields {
|
||||||
|
if cachedFieldDocs, exists := c.cache[field]; exists {
|
||||||
|
c.m.Unlock()
|
||||||
|
<-cachedFieldDocs.readyCh
|
||||||
|
c.m.Lock()
|
||||||
|
|
||||||
|
if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists {
|
||||||
|
for {
|
||||||
|
i := bytes.Index(tlist, TermSeparatorSplitSlice)
|
||||||
|
if i < 0 {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
visitor(field, tlist[0:i])
|
||||||
|
tlist = tlist[i+1:]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
c.m.Unlock()
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,63 +16,125 @@ package scorch
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"io/ioutil"
|
"reflect"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Stats tracks statistics about the index
|
// Stats tracks statistics about the index, fields that are
|
||||||
|
// prefixed like CurXxxx are gauges (can go up and down),
|
||||||
|
// and fields that are prefixed like TotXxxx are monotonically
|
||||||
|
// increasing counters.
|
||||||
type Stats struct {
|
type Stats struct {
|
||||||
updates, deletes, batches, errors uint64
|
TotUpdates uint64
|
||||||
analysisTime, indexTime uint64
|
TotDeletes uint64
|
||||||
termSearchersStarted uint64
|
|
||||||
termSearchersFinished uint64
|
TotBatches uint64
|
||||||
numPlainTextBytesIndexed uint64
|
TotBatchesEmpty uint64
|
||||||
numItemsIntroduced uint64
|
TotBatchIntroTime uint64
|
||||||
numItemsPersisted uint64
|
MaxBatchIntroTime uint64
|
||||||
i *Scorch
|
|
||||||
|
CurRootEpoch uint64
|
||||||
|
LastPersistedEpoch uint64
|
||||||
|
LastMergedEpoch uint64
|
||||||
|
|
||||||
|
TotOnErrors uint64
|
||||||
|
|
||||||
|
TotAnalysisTime uint64
|
||||||
|
TotIndexTime uint64
|
||||||
|
|
||||||
|
TotIndexedPlainTextBytes uint64
|
||||||
|
|
||||||
|
TotTermSearchersStarted uint64
|
||||||
|
TotTermSearchersFinished uint64
|
||||||
|
|
||||||
|
TotIntroduceLoop uint64
|
||||||
|
TotIntroduceSegmentBeg uint64
|
||||||
|
TotIntroduceSegmentEnd uint64
|
||||||
|
TotIntroducePersistBeg uint64
|
||||||
|
TotIntroducePersistEnd uint64
|
||||||
|
TotIntroduceMergeBeg uint64
|
||||||
|
TotIntroduceMergeEnd uint64
|
||||||
|
TotIntroduceRevertBeg uint64
|
||||||
|
TotIntroduceRevertEnd uint64
|
||||||
|
|
||||||
|
TotIntroducedItems uint64
|
||||||
|
TotIntroducedSegmentsBatch uint64
|
||||||
|
TotIntroducedSegmentsMerge uint64
|
||||||
|
|
||||||
|
TotPersistLoopBeg uint64
|
||||||
|
TotPersistLoopErr uint64
|
||||||
|
TotPersistLoopProgress uint64
|
||||||
|
TotPersistLoopWait uint64
|
||||||
|
TotPersistLoopWaitNotified uint64
|
||||||
|
TotPersistLoopEnd uint64
|
||||||
|
|
||||||
|
TotPersistedItems uint64
|
||||||
|
TotItemsToPersist uint64
|
||||||
|
TotPersistedSegments uint64
|
||||||
|
|
||||||
|
TotPersisterSlowMergerPause uint64
|
||||||
|
TotPersisterSlowMergerResume uint64
|
||||||
|
|
||||||
|
TotPersisterNapPauseCompleted uint64
|
||||||
|
TotPersisterMergerNapBreak uint64
|
||||||
|
|
||||||
|
TotFileMergeLoopBeg uint64
|
||||||
|
TotFileMergeLoopErr uint64
|
||||||
|
TotFileMergeLoopEnd uint64
|
||||||
|
|
||||||
|
TotFileMergePlan uint64
|
||||||
|
TotFileMergePlanErr uint64
|
||||||
|
TotFileMergePlanNone uint64
|
||||||
|
TotFileMergePlanOk uint64
|
||||||
|
|
||||||
|
TotFileMergePlanTasks uint64
|
||||||
|
TotFileMergePlanTasksDone uint64
|
||||||
|
TotFileMergePlanTasksErr uint64
|
||||||
|
TotFileMergePlanTasksSegments uint64
|
||||||
|
TotFileMergePlanTasksSegmentsEmpty uint64
|
||||||
|
|
||||||
|
TotFileMergeSegmentsEmpty uint64
|
||||||
|
TotFileMergeSegments uint64
|
||||||
|
TotFileSegmentsAtRoot uint64
|
||||||
|
TotFileMergeWrittenBytes uint64
|
||||||
|
|
||||||
|
TotFileMergeZapBeg uint64
|
||||||
|
TotFileMergeZapEnd uint64
|
||||||
|
TotFileMergeZapTime uint64
|
||||||
|
MaxFileMergeZapTime uint64
|
||||||
|
|
||||||
|
TotFileMergeIntroductions uint64
|
||||||
|
TotFileMergeIntroductionsDone uint64
|
||||||
|
TotFileMergeIntroductionsSkipped uint64
|
||||||
|
|
||||||
|
TotMemMergeBeg uint64
|
||||||
|
TotMemMergeErr uint64
|
||||||
|
TotMemMergeDone uint64
|
||||||
|
TotMemMergeZapBeg uint64
|
||||||
|
TotMemMergeZapEnd uint64
|
||||||
|
TotMemMergeZapTime uint64
|
||||||
|
MaxMemMergeZapTime uint64
|
||||||
|
TotMemMergeSegments uint64
|
||||||
|
TotMemorySegmentsAtRoot uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *Stats) statsMap() (map[string]interface{}, error) {
|
// atomically populates the returned map
|
||||||
|
func (s *Stats) ToMap() map[string]interface{} {
|
||||||
m := map[string]interface{}{}
|
m := map[string]interface{}{}
|
||||||
m["updates"] = atomic.LoadUint64(&s.updates)
|
sve := reflect.ValueOf(s).Elem()
|
||||||
m["deletes"] = atomic.LoadUint64(&s.deletes)
|
svet := sve.Type()
|
||||||
m["batches"] = atomic.LoadUint64(&s.batches)
|
for i := 0; i < svet.NumField(); i++ {
|
||||||
m["errors"] = atomic.LoadUint64(&s.errors)
|
svef := sve.Field(i)
|
||||||
m["analysis_time"] = atomic.LoadUint64(&s.analysisTime)
|
if svef.CanAddr() {
|
||||||
m["index_time"] = atomic.LoadUint64(&s.indexTime)
|
svefp := svef.Addr().Interface()
|
||||||
m["term_searchers_started"] = atomic.LoadUint64(&s.termSearchersStarted)
|
m[svet.Field(i).Name] = atomic.LoadUint64(svefp.(*uint64))
|
||||||
m["term_searchers_finished"] = atomic.LoadUint64(&s.termSearchersFinished)
|
|
||||||
m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&s.numPlainTextBytesIndexed)
|
|
||||||
m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced)
|
|
||||||
m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted)
|
|
||||||
|
|
||||||
if s.i.path != "" {
|
|
||||||
finfos, err := ioutil.ReadDir(s.i.path)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
var numFilesOnDisk, numBytesUsedDisk uint64
|
|
||||||
|
|
||||||
for _, finfo := range finfos {
|
|
||||||
if !finfo.IsDir() {
|
|
||||||
numBytesUsedDisk += uint64(finfo.Size())
|
|
||||||
numFilesOnDisk++
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return m
|
||||||
m["num_bytes_used_disk"] = numBytesUsedDisk
|
|
||||||
m["num_files_on_disk"] = numFilesOnDisk
|
|
||||||
}
|
|
||||||
|
|
||||||
return m, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// MarshalJSON implements json.Marshaler
|
// MarshalJSON implements json.Marshaler, and in contrast to standard
|
||||||
|
// json marshaling provides atomic safety
|
||||||
func (s *Stats) MarshalJSON() ([]byte, error) {
|
func (s *Stats) MarshalJSON() ([]byte, error) {
|
||||||
m, err := s.statsMap()
|
return json.Marshal(s.ToMap())
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
return json.Marshal(m)
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,7 +17,7 @@ package boltdb
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
|
||||||
"github.com/boltdb/bolt"
|
bolt "github.com/etcd-io/bbolt"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Iterator struct {
|
type Iterator struct {
|
||||||
|
|
|
@ -16,7 +16,7 @@ package boltdb
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"github.com/blevesearch/bleve/index/store"
|
"github.com/blevesearch/bleve/index/store"
|
||||||
"github.com/boltdb/bolt"
|
bolt "github.com/etcd-io/bbolt"
|
||||||
)
|
)
|
||||||
|
|
||||||
type Reader struct {
|
type Reader struct {
|
||||||
|
|
|
@ -30,7 +30,7 @@ import (
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index/store"
|
"github.com/blevesearch/bleve/index/store"
|
||||||
"github.com/blevesearch/bleve/registry"
|
"github.com/blevesearch/bleve/registry"
|
||||||
"github.com/boltdb/bolt"
|
bolt "github.com/etcd-io/bbolt"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
|
@ -74,6 +74,12 @@ func New(mo store.MergeOperator, config map[string]interface{}) (store.KVStore,
|
||||||
bo.ReadOnly = ro
|
bo.ReadOnly = ro
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if initialMmapSize, ok := config["initialMmapSize"].(int); ok {
|
||||||
|
bo.InitialMmapSize = initialMmapSize
|
||||||
|
} else if initialMmapSize, ok := config["initialMmapSize"].(float64); ok {
|
||||||
|
bo.InitialMmapSize = int(initialMmapSize)
|
||||||
|
}
|
||||||
|
|
||||||
db, err := bolt.Open(path, 0600, bo)
|
db, err := bolt.Open(path, 0600, bo)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
|
|
@ -15,11 +15,20 @@
|
||||||
package upsidedown
|
package upsidedown
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/document"
|
"github.com/blevesearch/bleve/document"
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/store"
|
"github.com/blevesearch/bleve/index/store"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeIndexReader int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var ir IndexReader
|
||||||
|
reflectStaticSizeIndexReader = int(reflect.TypeOf(ir).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type IndexReader struct {
|
type IndexReader struct {
|
||||||
index *UpsideDownCouch
|
index *UpsideDownCouch
|
||||||
kvreader store.KVReader
|
kvreader store.KVReader
|
||||||
|
@ -201,3 +210,17 @@ func incrementBytes(in []byte) []byte {
|
||||||
}
|
}
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (i *IndexReader) DocValueReader(fields []string) (index.DocValueReader, error) {
|
||||||
|
return &DocValueReader{i: i, fields: fields}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type DocValueReader struct {
|
||||||
|
i *IndexReader
|
||||||
|
fields []string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID,
|
||||||
|
visitor index.DocumentFieldTermVisitor) error {
|
||||||
|
return dvr.i.DocumentVisitFieldTerms(id, dvr.fields, visitor)
|
||||||
|
}
|
||||||
|
|
|
@ -16,13 +16,27 @@ package upsidedown
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes"
|
||||||
|
"reflect"
|
||||||
"sort"
|
"sort"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/index/store"
|
"github.com/blevesearch/bleve/index/store"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeUpsideDownCouchTermFieldReader int
|
||||||
|
var reflectStaticSizeUpsideDownCouchDocIDReader int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var tfr UpsideDownCouchTermFieldReader
|
||||||
|
reflectStaticSizeUpsideDownCouchTermFieldReader =
|
||||||
|
int(reflect.TypeOf(tfr).Size())
|
||||||
|
var cdr UpsideDownCouchDocIDReader
|
||||||
|
reflectStaticSizeUpsideDownCouchDocIDReader =
|
||||||
|
int(reflect.TypeOf(cdr).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type UpsideDownCouchTermFieldReader struct {
|
type UpsideDownCouchTermFieldReader struct {
|
||||||
count uint64
|
count uint64
|
||||||
indexReader *IndexReader
|
indexReader *IndexReader
|
||||||
|
@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct {
|
||||||
includeTermVectors bool
|
includeTermVectors bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (r *UpsideDownCouchTermFieldReader) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr +
|
||||||
|
len(r.term) +
|
||||||
|
r.tfrPrealloc.Size() +
|
||||||
|
len(r.keyBuf)
|
||||||
|
|
||||||
|
if r.tfrNext != nil {
|
||||||
|
sizeInBytes += r.tfrNext.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) {
|
func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) {
|
||||||
bufNeeded := termFrequencyRowKeySize(term, nil)
|
bufNeeded := termFrequencyRowKeySize(term, nil)
|
||||||
if bufNeeded < dictionaryRowKeySize(term) {
|
if bufNeeded < dictionaryRowKeySize(term) {
|
||||||
|
@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct {
|
||||||
onlyMode bool
|
onlyMode bool
|
||||||
}
|
}
|
||||||
|
|
||||||
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
|
func (r *UpsideDownCouchDocIDReader) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader +
|
||||||
|
reflectStaticSizeIndexReader + size.SizeOfPtr
|
||||||
|
|
||||||
|
for _, entry := range r.only {
|
||||||
|
sizeInBytes += size.SizeOfString + len(entry)
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
|
func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) {
|
||||||
startBytes := []byte{0x0}
|
startBytes := []byte{0x0}
|
||||||
endBytes := []byte{0xff}
|
endBytes := []byte{0xff}
|
||||||
|
|
||||||
|
|
|
@ -20,10 +20,22 @@ import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"math"
|
"math"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
"github.com/golang/protobuf/proto"
|
"github.com/golang/protobuf/proto"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeTermFrequencyRow int
|
||||||
|
var reflectStaticSizeTermVector int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var tfr TermFrequencyRow
|
||||||
|
reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size())
|
||||||
|
var tv TermVector
|
||||||
|
reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size())
|
||||||
|
}
|
||||||
|
|
||||||
const ByteSeparator byte = 0xff
|
const ByteSeparator byte = 0xff
|
||||||
|
|
||||||
type UpsideDownCouchRowStream chan UpsideDownCouchRow
|
type UpsideDownCouchRowStream chan UpsideDownCouchRow
|
||||||
|
@ -358,6 +370,11 @@ type TermVector struct {
|
||||||
end uint64
|
end uint64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (tv *TermVector) Size() int {
|
||||||
|
return reflectStaticSizeTermVector + size.SizeOfPtr +
|
||||||
|
len(tv.arrayPositions)*size.SizeOfUint64
|
||||||
|
}
|
||||||
|
|
||||||
func (tv *TermVector) String() string {
|
func (tv *TermVector) String() string {
|
||||||
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
|
return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions)
|
||||||
}
|
}
|
||||||
|
@ -371,6 +388,18 @@ type TermFrequencyRow struct {
|
||||||
field uint16
|
field uint16
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (tfr *TermFrequencyRow) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeTermFrequencyRow +
|
||||||
|
len(tfr.term) +
|
||||||
|
len(tfr.doc)
|
||||||
|
|
||||||
|
for _, entry := range tfr.vectors {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (tfr *TermFrequencyRow) Term() []byte {
|
func (tfr *TermFrequencyRow) Term() []byte {
|
||||||
return tfr.term
|
return tfr.term
|
||||||
}
|
}
|
||||||
|
@ -555,7 +584,7 @@ func (tfr *TermFrequencyRow) parseK(key []byte) error {
|
||||||
|
|
||||||
func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error {
|
func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error {
|
||||||
tfr.doc = key[3+len(term)+1:]
|
tfr.doc = key[3+len(term)+1:]
|
||||||
if len(tfr.doc) <= 0 {
|
if len(tfr.doc) == 0 {
|
||||||
return fmt.Errorf("invalid term frequency key, empty docid")
|
return fmt.Errorf("invalid term frequency key, empty docid")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -775,7 +775,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis.
|
||||||
}
|
}
|
||||||
|
|
||||||
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
|
func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector {
|
||||||
if len(in) <= 0 {
|
if len(in) == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -810,6 +810,7 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(batch.IndexOps) > 0 {
|
||||||
go func() {
|
go func() {
|
||||||
for _, doc := range batch.IndexOps {
|
for _, doc := range batch.IndexOps {
|
||||||
if doc != nil {
|
if doc != nil {
|
||||||
|
@ -819,6 +820,7 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
}
|
||||||
|
|
||||||
// retrieve back index rows concurrent with analysis
|
// retrieve back index rows concurrent with analysis
|
||||||
docBackIndexRowErr := error(nil)
|
docBackIndexRowErr := error(nil)
|
||||||
|
@ -958,6 +960,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) {
|
||||||
} else {
|
} else {
|
||||||
atomic.AddUint64(&udc.stats.errors, 1)
|
atomic.AddUint64(&udc.stats.errors, 1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
persistedCallback := batch.PersistedCallback()
|
||||||
|
if persistedCallback != nil {
|
||||||
|
persistedCallback(err)
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -433,6 +433,7 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest {
|
||||||
Explain: req.Explain,
|
Explain: req.Explain,
|
||||||
Sort: req.Sort.Copy(),
|
Sort: req.Sort.Copy(),
|
||||||
IncludeLocations: req.IncludeLocations,
|
IncludeLocations: req.IncludeLocations,
|
||||||
|
Score: req.Score,
|
||||||
}
|
}
|
||||||
return &rv
|
return &rv
|
||||||
}
|
}
|
||||||
|
|
|
@ -50,6 +50,12 @@ const storePath = "store"
|
||||||
|
|
||||||
var mappingInternalKey = []byte("_mapping")
|
var mappingInternalKey = []byte("_mapping")
|
||||||
|
|
||||||
|
const SearchQueryStartCallbackKey = "_search_query_start_callback_key"
|
||||||
|
const SearchQueryEndCallbackKey = "_search_query_end_callback_key"
|
||||||
|
|
||||||
|
type SearchQueryStartCallbackFn func(size uint64) error
|
||||||
|
type SearchQueryEndCallbackFn func(size uint64) error
|
||||||
|
|
||||||
func indexStorePath(path string) string {
|
func indexStorePath(path string) string {
|
||||||
return path + string(os.PathSeparator) + storePath
|
return path + string(os.PathSeparator) + storePath
|
||||||
}
|
}
|
||||||
|
@ -362,6 +368,68 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) {
|
||||||
return i.SearchInContext(context.Background(), req)
|
return i.SearchInContext(context.Background(), req)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var documentMatchEmptySize int
|
||||||
|
var searchContextEmptySize int
|
||||||
|
var facetResultEmptySize int
|
||||||
|
var documentEmptySize int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var dm search.DocumentMatch
|
||||||
|
documentMatchEmptySize = dm.Size()
|
||||||
|
|
||||||
|
var sc search.SearchContext
|
||||||
|
searchContextEmptySize = sc.Size()
|
||||||
|
|
||||||
|
var fr search.FacetResult
|
||||||
|
facetResultEmptySize = fr.Size()
|
||||||
|
|
||||||
|
var d document.Document
|
||||||
|
documentEmptySize = d.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
// memNeededForSearch is a helper function that returns an estimate of RAM
|
||||||
|
// needed to execute a search request.
|
||||||
|
func memNeededForSearch(req *SearchRequest,
|
||||||
|
searcher search.Searcher,
|
||||||
|
topnCollector *collector.TopNCollector) uint64 {
|
||||||
|
|
||||||
|
backingSize := req.Size + req.From + 1
|
||||||
|
if req.Size+req.From > collector.PreAllocSizeSkipCap {
|
||||||
|
backingSize = collector.PreAllocSizeSkipCap + 1
|
||||||
|
}
|
||||||
|
numDocMatches := backingSize + searcher.DocumentMatchPoolSize()
|
||||||
|
|
||||||
|
estimate := 0
|
||||||
|
|
||||||
|
// overhead, size in bytes from collector
|
||||||
|
estimate += topnCollector.Size()
|
||||||
|
|
||||||
|
// pre-allocing DocumentMatchPool
|
||||||
|
estimate += searchContextEmptySize + numDocMatches*documentMatchEmptySize
|
||||||
|
|
||||||
|
// searcher overhead
|
||||||
|
estimate += searcher.Size()
|
||||||
|
|
||||||
|
// overhead from results, lowestMatchOutsideResults
|
||||||
|
estimate += (numDocMatches + 1) * documentMatchEmptySize
|
||||||
|
|
||||||
|
// additional overhead from SearchResult
|
||||||
|
estimate += reflectStaticSizeSearchResult + reflectStaticSizeSearchStatus
|
||||||
|
|
||||||
|
// overhead from facet results
|
||||||
|
if req.Facets != nil {
|
||||||
|
estimate += len(req.Facets) * facetResultEmptySize
|
||||||
|
}
|
||||||
|
|
||||||
|
// highlighting, store
|
||||||
|
if len(req.Fields) > 0 || req.Highlight != nil {
|
||||||
|
// Size + From => number of hits
|
||||||
|
estimate += (req.Size + req.From) * documentEmptySize
|
||||||
|
}
|
||||||
|
|
||||||
|
return uint64(estimate)
|
||||||
|
}
|
||||||
|
|
||||||
// SearchInContext executes a search request operation within the provided
|
// SearchInContext executes a search request operation within the provided
|
||||||
// Context. Returns a SearchResult object or an error.
|
// Context. Returns a SearchResult object or an error.
|
||||||
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) {
|
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) {
|
||||||
|
@ -390,6 +458,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
|
||||||
searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{
|
searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{
|
||||||
Explain: req.Explain,
|
Explain: req.Explain,
|
||||||
IncludeTermVectors: req.IncludeLocations || req.Highlight != nil,
|
IncludeTermVectors: req.IncludeLocations || req.Highlight != nil,
|
||||||
|
Score: req.Score,
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -428,6 +497,24 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
|
||||||
collector.SetFacetsBuilder(facetsBuilder)
|
collector.SetFacetsBuilder(facetsBuilder)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
memNeeded := memNeededForSearch(req, searcher, collector)
|
||||||
|
if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil {
|
||||||
|
if cbF, ok := cb.(SearchQueryStartCallbackFn); ok {
|
||||||
|
err = cbF(memNeeded)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if cb := ctx.Value(SearchQueryEndCallbackKey); cb != nil {
|
||||||
|
if cbF, ok := cb.(SearchQueryEndCallbackFn); ok {
|
||||||
|
defer func() {
|
||||||
|
_ = cbF(memNeeded)
|
||||||
|
}()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
err = collector.Collect(ctx, searcher, indexReader)
|
err = collector.Collect(ctx, searcher, indexReader)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -459,7 +546,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
|
||||||
doc, err := indexReader.Document(hit.ID)
|
doc, err := indexReader.Document(hit.ID)
|
||||||
if err == nil && doc != nil {
|
if err == nil && doc != nil {
|
||||||
if len(req.Fields) > 0 {
|
if len(req.Fields) > 0 {
|
||||||
for _, f := range req.Fields {
|
fieldsToLoad := deDuplicate(req.Fields)
|
||||||
|
for _, f := range fieldsToLoad {
|
||||||
for _, docF := range doc.Fields {
|
for _, docF := range doc.Fields {
|
||||||
if f == "*" || docF.Name() == f {
|
if f == "*" || docF.Name() == f {
|
||||||
var value interface{}
|
var value interface{}
|
||||||
|
@ -533,9 +621,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr
|
||||||
return &SearchResult{
|
return &SearchResult{
|
||||||
Status: &SearchStatus{
|
Status: &SearchStatus{
|
||||||
Total: 1,
|
Total: 1,
|
||||||
Failed: 0,
|
|
||||||
Successful: 1,
|
Successful: 1,
|
||||||
Errors: make(map[string]error),
|
|
||||||
},
|
},
|
||||||
Request: req,
|
Request: req,
|
||||||
Hits: hits,
|
Hits: hits,
|
||||||
|
@ -755,3 +841,16 @@ func (f *indexImplFieldDict) Close() error {
|
||||||
}
|
}
|
||||||
return f.indexReader.Close()
|
return f.indexReader.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// deDuplicate returns a new slice holding the entries of fields with
// duplicates removed. The first occurrence of every entry is kept and the
// relative order of the surviving entries is preserved. The returned slice
// is never nil, even when fields is nil or empty.
func deDuplicate(fields []string) []string {
	// len(fields) is an upper bound for both containers, so size them up
	// front instead of letting them grow while iterating.
	seen := make(map[string]struct{}, len(fields))
	ret := make([]string, 0, len(fields))
	for _, entry := range fields {
		if _, exists := seen[entry]; exists {
			continue
		}
		seen[entry] = struct{}{}
		ret = append(ret, entry)
	}
	return ret
}
|
||||||
|
|
|
@ -18,6 +18,7 @@ import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"os"
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index/upsidedown"
|
"github.com/blevesearch/bleve/index/upsidedown"
|
||||||
)
|
)
|
||||||
|
@ -92,5 +93,5 @@ func (i *indexMeta) Save(path string) (err error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func indexMetaPath(path string) string {
|
func indexMetaPath(path string) string {
|
||||||
return path + string(os.PathSeparator) + metaFilename
|
return filepath.Join(path, metaFilename)
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,7 +42,7 @@ type DocumentMapping struct {
|
||||||
Dynamic bool `json:"dynamic"`
|
Dynamic bool `json:"dynamic"`
|
||||||
Properties map[string]*DocumentMapping `json:"properties,omitempty"`
|
Properties map[string]*DocumentMapping `json:"properties,omitempty"`
|
||||||
Fields []*FieldMapping `json:"fields,omitempty"`
|
Fields []*FieldMapping `json:"fields,omitempty"`
|
||||||
DefaultAnalyzer string `json:"default_analyzer"`
|
DefaultAnalyzer string `json:"default_analyzer,omitempty"`
|
||||||
|
|
||||||
// StructTagKey overrides "json" when looking for field names in struct tags
|
// StructTagKey overrides "json" when looking for field names in struct tags
|
||||||
StructTagKey string `json:"struct_tag_key,omitempty"`
|
StructTagKey string `json:"struct_tag_key,omitempty"`
|
||||||
|
@ -324,13 +324,17 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) {
|
func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes []uint64, context *walkContext) {
|
||||||
// allow default "json" tag to be overriden
|
// allow default "json" tag to be overridden
|
||||||
structTagKey := dm.StructTagKey
|
structTagKey := dm.StructTagKey
|
||||||
if structTagKey == "" {
|
if structTagKey == "" {
|
||||||
structTagKey = "json"
|
structTagKey = "json"
|
||||||
}
|
}
|
||||||
|
|
||||||
val := reflect.ValueOf(data)
|
val := reflect.ValueOf(data)
|
||||||
|
if !val.IsValid() {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
typ := val.Type()
|
typ := val.Type()
|
||||||
switch typ.Kind() {
|
switch typ.Kind() {
|
||||||
case reflect.Map:
|
case reflect.Map:
|
||||||
|
@ -420,8 +424,12 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string,
|
||||||
if subDocMapping != nil {
|
if subDocMapping != nil {
|
||||||
// index by explicit mapping
|
// index by explicit mapping
|
||||||
for _, fieldMapping := range subDocMapping.Fields {
|
for _, fieldMapping := range subDocMapping.Fields {
|
||||||
|
if fieldMapping.Type == "geopoint" {
|
||||||
|
fieldMapping.processGeoPoint(property, pathString, path, indexes, context)
|
||||||
|
} else {
|
||||||
fieldMapping.processString(propertyValueString, pathString, path, indexes, context)
|
fieldMapping.processString(propertyValueString, pathString, path, indexes, context)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} else if closestDocMapping.Dynamic {
|
} else if closestDocMapping.Dynamic {
|
||||||
// automatic indexing behavior
|
// automatic indexing behavior
|
||||||
|
|
||||||
|
|
|
@ -320,8 +320,8 @@ func (im *IndexMappingImpl) determineType(data interface{}) string {
|
||||||
func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error {
|
func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error {
|
||||||
docType := im.determineType(data)
|
docType := im.determineType(data)
|
||||||
docMapping := im.mappingForType(docType)
|
docMapping := im.mappingForType(docType)
|
||||||
walkContext := im.newWalkContext(doc, docMapping)
|
|
||||||
if docMapping.Enabled {
|
if docMapping.Enabled {
|
||||||
|
walkContext := im.newWalkContext(doc, docMapping)
|
||||||
docMapping.walkDocument(data, []string{}, []uint64{}, walkContext)
|
docMapping.walkDocument(data, []string{}, []uint64{}, walkContext)
|
||||||
|
|
||||||
// see if the _all field was disabled
|
// see if the _all field was disabled
|
||||||
|
|
|
@ -35,6 +35,9 @@ func lookupPropertyPath(data interface{}, path string) interface{} {
|
||||||
|
|
||||||
func lookupPropertyPathPart(data interface{}, part string) interface{} {
|
func lookupPropertyPathPart(data interface{}, part string) interface{} {
|
||||||
val := reflect.ValueOf(data)
|
val := reflect.ValueOf(data)
|
||||||
|
if !val.IsValid() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
typ := val.Type()
|
typ := val.Type()
|
||||||
switch typ.Kind() {
|
switch typ.Kind() {
|
||||||
case reflect.Map:
|
case reflect.Map:
|
||||||
|
|
|
@ -14,7 +14,7 @@ var interleaveShift = []uint{1, 2, 4, 8, 16}
|
||||||
|
|
||||||
// Interleave the first 32 bits of each uint64
|
// Interleave the first 32 bits of each uint64
|
||||||
// adapted from org.apache.lucene.util.BitUtil
|
// adapted from org.apache.lucene.util.BitUtil
|
||||||
// whcih was adapted from:
|
// which was adapted from:
|
||||||
// http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
|
// http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN
|
||||||
func Interleave(v1, v2 uint64) uint64 {
|
func Interleave(v1, v2 uint64) uint64 {
|
||||||
v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4]
|
v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4]
|
||||||
|
|
|
@ -77,6 +77,10 @@ func (p PrefixCoded) Int64() (int64, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func ValidPrefixCodedTerm(p string) (bool, int) {
|
func ValidPrefixCodedTerm(p string) (bool, int) {
|
||||||
|
return ValidPrefixCodedTermBytes([]byte(p))
|
||||||
|
}
|
||||||
|
|
||||||
|
func ValidPrefixCodedTermBytes(p []byte) (bool, int) {
|
||||||
if len(p) > 0 {
|
if len(p) > 0 {
|
||||||
if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 {
|
if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 {
|
||||||
return false, 0
|
return false, 0
|
||||||
|
|
|
@ -17,15 +17,29 @@ package bleve
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/analysis"
|
"github.com/blevesearch/bleve/analysis"
|
||||||
"github.com/blevesearch/bleve/analysis/datetime/optional"
|
"github.com/blevesearch/bleve/analysis/datetime/optional"
|
||||||
|
"github.com/blevesearch/bleve/document"
|
||||||
"github.com/blevesearch/bleve/registry"
|
"github.com/blevesearch/bleve/registry"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/search/collector"
|
||||||
"github.com/blevesearch/bleve/search/query"
|
"github.com/blevesearch/bleve/search/query"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeSearchResult int
|
||||||
|
var reflectStaticSizeSearchStatus int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var sr SearchResult
|
||||||
|
reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size())
|
||||||
|
var ss SearchStatus
|
||||||
|
reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size())
|
||||||
|
}
|
||||||
|
|
||||||
var cache = registry.NewCache()
|
var cache = registry.NewCache()
|
||||||
|
|
||||||
const defaultDateTimeParser = optional.Name
|
const defaultDateTimeParser = optional.Name
|
||||||
|
@ -247,6 +261,7 @@ func (h *HighlightRequest) AddField(field string) {
|
||||||
// Explain triggers inclusion of additional search
|
// Explain triggers inclusion of additional search
|
||||||
// result score explanations.
|
// result score explanations.
|
||||||
// Sort describes the desired order for the results to be returned.
|
// Sort describes the desired order for the results to be returned.
|
||||||
|
// Score controls the kind of scoring performed
|
||||||
//
|
//
|
||||||
// A special field named "*" can be used to return all fields.
|
// A special field named "*" can be used to return all fields.
|
||||||
type SearchRequest struct {
|
type SearchRequest struct {
|
||||||
|
@ -259,6 +274,7 @@ type SearchRequest struct {
|
||||||
Explain bool `json:"explain"`
|
Explain bool `json:"explain"`
|
||||||
Sort search.SortOrder `json:"sort"`
|
Sort search.SortOrder `json:"sort"`
|
||||||
IncludeLocations bool `json:"includeLocations"`
|
IncludeLocations bool `json:"includeLocations"`
|
||||||
|
Score string `json:"score,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *SearchRequest) Validate() error {
|
func (r *SearchRequest) Validate() error {
|
||||||
|
@ -308,6 +324,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error {
|
||||||
Explain bool `json:"explain"`
|
Explain bool `json:"explain"`
|
||||||
Sort []json.RawMessage `json:"sort"`
|
Sort []json.RawMessage `json:"sort"`
|
||||||
IncludeLocations bool `json:"includeLocations"`
|
IncludeLocations bool `json:"includeLocations"`
|
||||||
|
Score string `json:"score"`
|
||||||
}
|
}
|
||||||
|
|
||||||
err := json.Unmarshal(input, &temp)
|
err := json.Unmarshal(input, &temp)
|
||||||
|
@ -334,6 +351,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error {
|
||||||
r.Fields = temp.Fields
|
r.Fields = temp.Fields
|
||||||
r.Facets = temp.Facets
|
r.Facets = temp.Facets
|
||||||
r.IncludeLocations = temp.IncludeLocations
|
r.IncludeLocations = temp.IncludeLocations
|
||||||
|
r.Score = temp.Score
|
||||||
r.Query, err = query.ParseQuery(temp.Q)
|
r.Query, err = query.ParseQuery(temp.Q)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -432,6 +450,24 @@ type SearchResult struct {
|
||||||
Facets search.FacetResults `json:"facets"`
|
Facets search.FacetResults `json:"facets"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (sr *SearchResult) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeSearchResult + size.SizeOfPtr +
|
||||||
|
reflectStaticSizeSearchStatus
|
||||||
|
|
||||||
|
for _, entry := range sr.Hits {
|
||||||
|
if entry != nil {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, v := range sr.Facets {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) +
|
||||||
|
v.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (sr *SearchResult) String() string {
|
func (sr *SearchResult) String() string {
|
||||||
rv := ""
|
rv := ""
|
||||||
if sr.Total > 0 {
|
if sr.Total > 0 {
|
||||||
|
@ -488,3 +524,44 @@ func (sr *SearchResult) Merge(other *SearchResult) {
|
||||||
|
|
||||||
sr.Facets.Merge(other.Facets)
|
sr.Facets.Merge(other.Facets)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MemoryNeededForSearchResult is an exported helper function to determine the RAM
|
||||||
|
// needed to accommodate the results for a given search request.
|
||||||
|
func MemoryNeededForSearchResult(req *SearchRequest) uint64 {
|
||||||
|
if req == nil {
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
numDocMatches := req.Size + req.From
|
||||||
|
if req.Size+req.From > collector.PreAllocSizeSkipCap {
|
||||||
|
numDocMatches = collector.PreAllocSizeSkipCap
|
||||||
|
}
|
||||||
|
|
||||||
|
estimate := 0
|
||||||
|
|
||||||
|
// overhead from the SearchResult structure
|
||||||
|
var sr SearchResult
|
||||||
|
estimate += sr.Size()
|
||||||
|
|
||||||
|
var dm search.DocumentMatch
|
||||||
|
sizeOfDocumentMatch := dm.Size()
|
||||||
|
|
||||||
|
// overhead from results
|
||||||
|
estimate += numDocMatches * sizeOfDocumentMatch
|
||||||
|
|
||||||
|
// overhead from facet results
|
||||||
|
if req.Facets != nil {
|
||||||
|
var fr search.FacetResult
|
||||||
|
estimate += len(req.Facets) * fr.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
// highlighting, store
|
||||||
|
var d document.Document
|
||||||
|
if len(req.Fields) > 0 || req.Highlight != nil {
|
||||||
|
for i := 0; i < (req.Size + req.From); i++ {
|
||||||
|
estimate += (req.Size + req.From) * d.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return uint64(estimate)
|
||||||
|
}
|
||||||
|
|
|
@ -30,3 +30,23 @@ type Collector interface {
|
||||||
SetFacetsBuilder(facetsBuilder *FacetsBuilder)
|
SetFacetsBuilder(facetsBuilder *FacetsBuilder)
|
||||||
FacetResults() FacetResults
|
FacetResults() FacetResults
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// DocumentMatchHandler is the type of document match callback
|
||||||
|
// bleve will invoke during the search.
|
||||||
|
// Eventually, bleve will indicate the completion of an ongoing search,
|
||||||
|
// by passing a nil value for the document match callback.
|
||||||
|
// The application should take a copy of the hit/documentMatch
|
||||||
|
// if it wishes to own it or needs prolonged access to it.
|
||||||
|
type DocumentMatchHandler func(hit *DocumentMatch) error
|
||||||
|
|
||||||
|
type MakeDocumentMatchHandlerKeyType string
|
||||||
|
|
||||||
|
var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType(
|
||||||
|
"MakeDocumentMatchHandlerKey")
|
||||||
|
|
||||||
|
// MakeDocumentMatchHandler is an optional DocumentMatchHandler
|
||||||
|
// builder function which the applications can pass to bleve.
|
||||||
|
// These builder methods give a DocumentMatchHandler function
|
||||||
|
// to bleve, which it will invoke on every document match.
|
||||||
|
type MakeDocumentMatchHandler func(ctx *SearchContext) (
|
||||||
|
callback DocumentMatchHandler, loadID bool, err error)
|
||||||
|
|
|
@ -25,9 +25,9 @@ type collectStoreHeap struct {
|
||||||
compare collectorCompare
|
compare collectorCompare
|
||||||
}
|
}
|
||||||
|
|
||||||
func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap {
|
func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap {
|
||||||
rv := &collectStoreHeap{
|
rv := &collectStoreHeap{
|
||||||
heap: make(search.DocumentMatchCollection, 0, cap),
|
heap: make(search.DocumentMatchCollection, 0, capacity),
|
||||||
compare: compare,
|
compare: compare,
|
||||||
}
|
}
|
||||||
heap.Init(rv)
|
heap.Init(rv)
|
||||||
|
|
|
@ -25,7 +25,7 @@ type collectStoreList struct {
|
||||||
compare collectorCompare
|
compare collectorCompare
|
||||||
}
|
}
|
||||||
|
|
||||||
func newStoreList(cap int, compare collectorCompare) *collectStoreList {
|
func newStoreList(capacity int, compare collectorCompare) *collectStoreList {
|
||||||
rv := &collectStoreList{
|
rv := &collectStoreList{
|
||||||
results: list.New(),
|
results: list.New(),
|
||||||
compare: compare,
|
compare: compare,
|
||||||
|
@ -34,8 +34,7 @@ func newStoreList(cap int, compare collectorCompare) *collectStoreList {
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch,
|
func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch {
|
||||||
size int) *search.DocumentMatch {
|
|
||||||
c.add(doc)
|
c.add(doc)
|
||||||
if c.len() > size {
|
if c.len() > size {
|
||||||
return c.removeLast()
|
return c.removeLast()
|
||||||
|
|
|
@ -21,9 +21,9 @@ type collectStoreSlice struct {
|
||||||
compare collectorCompare
|
compare collectorCompare
|
||||||
}
|
}
|
||||||
|
|
||||||
func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice {
|
func newStoreSlice(capacity int, compare collectorCompare) *collectStoreSlice {
|
||||||
rv := &collectStoreSlice{
|
rv := &collectStoreSlice{
|
||||||
slice: make(search.DocumentMatchCollection, 0, cap),
|
slice: make(search.DocumentMatchCollection, 0, capacity),
|
||||||
compare: compare,
|
compare: compare,
|
||||||
}
|
}
|
||||||
return rv
|
return rv
|
||||||
|
|
|
@ -16,12 +16,21 @@ package collector
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"reflect"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeTopNCollector int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var coll TopNCollector
|
||||||
|
reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type collectorStore interface {
|
type collectorStore interface {
|
||||||
// Add the document, and if the new store size exceeds the provided size
|
// Add the document, and if the new store size exceeds the provided size
|
||||||
// the last element is removed and returned. If the size has not been
|
// the last element is removed and returned. If the size has not been
|
||||||
|
@ -58,6 +67,8 @@ type TopNCollector struct {
|
||||||
cachedDesc []bool
|
cachedDesc []bool
|
||||||
|
|
||||||
lowestMatchOutsideResults *search.DocumentMatch
|
lowestMatchOutsideResults *search.DocumentMatch
|
||||||
|
updateFieldVisitor index.DocumentFieldTermVisitor
|
||||||
|
dvReader index.DocValueReader
|
||||||
}
|
}
|
||||||
|
|
||||||
// CheckDoneEvery controls how frequently we check the context deadline
|
// CheckDoneEvery controls how frequently we check the context deadline
|
||||||
|
@ -98,6 +109,22 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector
|
||||||
return hc
|
return hc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (hc *TopNCollector) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr
|
||||||
|
|
||||||
|
if hc.facetsBuilder != nil {
|
||||||
|
sizeInBytes += hc.facetsBuilder.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range hc.neededFields {
|
||||||
|
sizeInBytes += len(entry) + size.SizeOfString
|
||||||
|
}
|
||||||
|
|
||||||
|
sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc)
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
// Collect goes to the index to find the matching documents
|
// Collect goes to the index to find the matching documents
|
||||||
func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
|
func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error {
|
||||||
startTime := time.Now()
|
startTime := time.Now()
|
||||||
|
@ -113,8 +140,34 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
|
||||||
}
|
}
|
||||||
searchContext := &search.SearchContext{
|
searchContext := &search.SearchContext{
|
||||||
DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)),
|
DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)),
|
||||||
|
Collector: hc,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
hc.dvReader, err = reader.DocValueReader(hc.neededFields)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
hc.updateFieldVisitor = func(field string, term []byte) {
|
||||||
|
if hc.facetsBuilder != nil {
|
||||||
|
hc.facetsBuilder.UpdateVisitor(field, term)
|
||||||
|
}
|
||||||
|
hc.sort.UpdateVisitor(field, term)
|
||||||
|
}
|
||||||
|
|
||||||
|
dmHandlerMaker := MakeTopNDocumentMatchHandler
|
||||||
|
if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil {
|
||||||
|
dmHandlerMaker = cv.(search.MakeDocumentMatchHandler)
|
||||||
|
}
|
||||||
|
// use the application given builder for making the custom document match
|
||||||
|
// handler and perform callbacks/invocations on the newly made handler.
|
||||||
|
dmHandler, loadID, err := dmHandlerMaker(searchContext)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
hc.needDocIds = hc.needDocIds || loadID
|
||||||
|
|
||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return ctx.Err()
|
return ctx.Err()
|
||||||
|
@ -130,13 +183,26 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
err = hc.collectSingle(searchContext, reader, next)
|
err = hc.prepareDocumentMatch(searchContext, reader, next)
|
||||||
|
if err != nil {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
err = dmHandler(next)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
|
|
||||||
next, err = searcher.Next(searchContext)
|
next, err = searcher.Next(searchContext)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// help finalize/flush the results in case
|
||||||
|
// of custom document match handlers.
|
||||||
|
err = dmHandler(nil)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
// compute search duration
|
// compute search duration
|
||||||
hc.took = time.Since(startTime)
|
hc.took = time.Since(startTime)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -152,8 +218,8 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher,
|
||||||
|
|
||||||
var sortByScoreOpt = []string{"_score"}
|
var sortByScoreOpt = []string{"_score"}
|
||||||
|
|
||||||
func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error {
|
func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext,
|
||||||
var err error
|
reader index.IndexReader, d *search.DocumentMatch) (err error) {
|
||||||
|
|
||||||
// visit field terms for features that require it (sort, facets)
|
// visit field terms for features that require it (sort, facets)
|
||||||
if len(hc.neededFields) > 0 {
|
if len(hc.neededFields) > 0 {
|
||||||
|
@ -187,11 +253,24 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
|
||||||
hc.sort.Value(d)
|
hc.sort.Value(d)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func MakeTopNDocumentMatchHandler(
|
||||||
|
ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) {
|
||||||
|
var hc *TopNCollector
|
||||||
|
var ok bool
|
||||||
|
if hc, ok = ctx.Collector.(*TopNCollector); ok {
|
||||||
|
return func(d *search.DocumentMatch) error {
|
||||||
|
if d == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
// optimization, we track lowest sorting hit already removed from heap
|
// optimization, we track lowest sorting hit already removed from heap
|
||||||
// with this one comparison, we can avoid all heap operations if
|
// with this one comparison, we can avoid all heap operations if
|
||||||
// this hit would have been added and then immediately removed
|
// this hit would have been added and then immediately removed
|
||||||
if hc.lowestMatchOutsideResults != nil {
|
if hc.lowestMatchOutsideResults != nil {
|
||||||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults)
|
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d,
|
||||||
|
hc.lowestMatchOutsideResults)
|
||||||
if cmp >= 0 {
|
if cmp >= 0 {
|
||||||
// this hit can't possibly be in the result set, so avoid heap ops
|
// this hit can't possibly be in the result set, so avoid heap ops
|
||||||
ctx.DocumentMatchPool.Put(d)
|
ctx.DocumentMatchPool.Put(d)
|
||||||
|
@ -204,7 +283,8 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
|
||||||
if hc.lowestMatchOutsideResults == nil {
|
if hc.lowestMatchOutsideResults == nil {
|
||||||
hc.lowestMatchOutsideResults = removed
|
hc.lowestMatchOutsideResults = removed
|
||||||
} else {
|
} else {
|
||||||
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults)
|
cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc,
|
||||||
|
removed, hc.lowestMatchOutsideResults)
|
||||||
if cmp < 0 {
|
if cmp < 0 {
|
||||||
tmp := hc.lowestMatchOutsideResults
|
tmp := hc.lowestMatchOutsideResults
|
||||||
hc.lowestMatchOutsideResults = removed
|
hc.lowestMatchOutsideResults = removed
|
||||||
|
@ -212,8 +292,10 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
|
}, false, nil
|
||||||
|
}
|
||||||
|
return nil, false, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// visitFieldTerms is responsible for visiting the field terms of the
|
// visitFieldTerms is responsible for visiting the field terms of the
|
||||||
|
@ -223,13 +305,7 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc
|
||||||
hc.facetsBuilder.StartDoc()
|
hc.facetsBuilder.StartDoc()
|
||||||
}
|
}
|
||||||
|
|
||||||
err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, func(field string, term []byte) {
|
err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor)
|
||||||
if hc.facetsBuilder != nil {
|
|
||||||
hc.facetsBuilder.UpdateVisitor(field, term)
|
|
||||||
}
|
|
||||||
hc.sort.UpdateVisitor(field, term)
|
|
||||||
})
|
|
||||||
|
|
||||||
if hc.facetsBuilder != nil {
|
if hc.facetsBuilder != nil {
|
||||||
hc.facetsBuilder.EndDoc()
|
hc.facetsBuilder.EndDoc()
|
||||||
}
|
}
|
||||||
|
@ -257,6 +333,7 @@ func (hc *TopNCollector) finalizeResults(r index.IndexReader) error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
doc.Complete(nil)
|
||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@ -288,5 +365,5 @@ func (hc *TopNCollector) FacetResults() search.FacetResults {
|
||||||
if hc.facetsBuilder != nil {
|
if hc.facetsBuilder != nil {
|
||||||
return hc.facetsBuilder.Results()
|
return hc.facetsBuilder.Results()
|
||||||
}
|
}
|
||||||
return search.FacetResults{}
|
return nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,8 +17,18 @@ package search
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeExplanation int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var e Explanation
|
||||||
|
reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type Explanation struct {
|
type Explanation struct {
|
||||||
Value float64 `json:"value"`
|
Value float64 `json:"value"`
|
||||||
Message string `json:"message"`
|
Message string `json:"message"`
|
||||||
|
@ -32,3 +42,14 @@ func (expl *Explanation) String() string {
|
||||||
}
|
}
|
||||||
return string(js)
|
return string(js)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (expl *Explanation) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeExplanation + size.SizeOfPtr +
|
||||||
|
len(expl.Message)
|
||||||
|
|
||||||
|
for _, entry := range expl.Children {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
|
@ -15,13 +15,25 @@
|
||||||
package facet
|
package facet
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
"sort"
|
"sort"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/numeric"
|
"github.com/blevesearch/bleve/numeric"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeDateTimeFacetBuilder int
|
||||||
|
var reflectStaticSizedateTimeRange int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var dtfb DateTimeFacetBuilder
|
||||||
|
reflectStaticSizeDateTimeFacetBuilder = int(reflect.TypeOf(dtfb).Size())
|
||||||
|
var dtr dateTimeRange
|
||||||
|
reflectStaticSizedateTimeRange = int(reflect.TypeOf(dtr).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type dateTimeRange struct {
|
type dateTimeRange struct {
|
||||||
start time.Time
|
start time.Time
|
||||||
end time.Time
|
end time.Time
|
||||||
|
@ -46,6 +58,23 @@ func NewDateTimeFacetBuilder(field string, size int) *DateTimeFacetBuilder {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (fb *DateTimeFacetBuilder) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr +
|
||||||
|
len(fb.field)
|
||||||
|
|
||||||
|
for k, _ := range fb.termsCount {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) +
|
||||||
|
size.SizeOfInt
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, _ := range fb.ranges {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) +
|
||||||
|
size.SizeOfPtr + reflectStaticSizedateTimeRange
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) {
|
func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) {
|
||||||
r := dateTimeRange{
|
r := dateTimeRange{
|
||||||
start: start,
|
start: start,
|
||||||
|
|
|
@ -15,12 +15,24 @@
|
||||||
package facet
|
package facet
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/numeric"
|
"github.com/blevesearch/bleve/numeric"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeNumericFacetBuilder int
|
||||||
|
var reflectStaticSizenumericRange int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var nfb NumericFacetBuilder
|
||||||
|
reflectStaticSizeNumericFacetBuilder = int(reflect.TypeOf(nfb).Size())
|
||||||
|
var nr numericRange
|
||||||
|
reflectStaticSizenumericRange = int(reflect.TypeOf(nr).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type numericRange struct {
|
type numericRange struct {
|
||||||
min *float64
|
min *float64
|
||||||
max *float64
|
max *float64
|
||||||
|
@ -45,6 +57,23 @@ func NewNumericFacetBuilder(field string, size int) *NumericFacetBuilder {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (fb *NumericFacetBuilder) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr +
|
||||||
|
len(fb.field)
|
||||||
|
|
||||||
|
for k, _ := range fb.termsCount {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) +
|
||||||
|
size.SizeOfInt
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, _ := range fb.ranges {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) +
|
||||||
|
size.SizeOfPtr + reflectStaticSizenumericRange
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) {
|
func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) {
|
||||||
r := numericRange{
|
r := numericRange{
|
||||||
min: min,
|
min: min,
|
||||||
|
|
|
@ -15,11 +15,20 @@
|
||||||
package facet
|
package facet
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeTermsFacetBuilder int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var tfb TermsFacetBuilder
|
||||||
|
reflectStaticSizeTermsFacetBuilder = int(reflect.TypeOf(tfb).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type TermsFacetBuilder struct {
|
type TermsFacetBuilder struct {
|
||||||
size int
|
size int
|
||||||
field string
|
field string
|
||||||
|
@ -37,6 +46,18 @@ func NewTermsFacetBuilder(field string, size int) *TermsFacetBuilder {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (fb *TermsFacetBuilder) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr +
|
||||||
|
len(fb.field)
|
||||||
|
|
||||||
|
for k, _ := range fb.termsCount {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) +
|
||||||
|
size.SizeOfInt
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (fb *TermsFacetBuilder) Field() string {
|
func (fb *TermsFacetBuilder) Field() string {
|
||||||
return fb.field
|
return fb.field
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,11 +15,32 @@
|
||||||
package search
|
package search
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeFacetsBuilder int
|
||||||
|
var reflectStaticSizeFacetResult int
|
||||||
|
var reflectStaticSizeTermFacet int
|
||||||
|
var reflectStaticSizeNumericRangeFacet int
|
||||||
|
var reflectStaticSizeDateRangeFacet int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var fb FacetsBuilder
|
||||||
|
reflectStaticSizeFacetsBuilder = int(reflect.TypeOf(fb).Size())
|
||||||
|
var fr FacetResult
|
||||||
|
reflectStaticSizeFacetResult = int(reflect.TypeOf(fr).Size())
|
||||||
|
var tf TermFacet
|
||||||
|
reflectStaticSizeTermFacet = int(reflect.TypeOf(tf).Size())
|
||||||
|
var nrf NumericRangeFacet
|
||||||
|
reflectStaticSizeNumericRangeFacet = int(reflect.TypeOf(nrf).Size())
|
||||||
|
var drf DateRangeFacet
|
||||||
|
reflectStaticSizeDateRangeFacet = int(reflect.TypeOf(drf).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type FacetBuilder interface {
|
type FacetBuilder interface {
|
||||||
StartDoc()
|
StartDoc()
|
||||||
UpdateVisitor(field string, term []byte)
|
UpdateVisitor(field string, term []byte)
|
||||||
|
@ -27,23 +48,40 @@ type FacetBuilder interface {
|
||||||
|
|
||||||
Result() *FacetResult
|
Result() *FacetResult
|
||||||
Field() string
|
Field() string
|
||||||
|
|
||||||
|
Size() int
|
||||||
}
|
}
|
||||||
|
|
||||||
type FacetsBuilder struct {
|
type FacetsBuilder struct {
|
||||||
indexReader index.IndexReader
|
indexReader index.IndexReader
|
||||||
facets map[string]FacetBuilder
|
facetNames []string
|
||||||
|
facets []FacetBuilder
|
||||||
fields []string
|
fields []string
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder {
|
func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder {
|
||||||
return &FacetsBuilder{
|
return &FacetsBuilder{
|
||||||
indexReader: indexReader,
|
indexReader: indexReader,
|
||||||
facets: make(map[string]FacetBuilder, 0),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (fb *FacetsBuilder) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr
|
||||||
|
|
||||||
|
for k, v := range fb.facets {
|
||||||
|
sizeInBytes += size.SizeOfString + v.Size() + len(fb.facetNames[k])
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range fb.fields {
|
||||||
|
sizeInBytes += size.SizeOfString + len(entry)
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) {
|
func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) {
|
||||||
fb.facets[name] = facetBuilder
|
fb.facetNames = append(fb.facetNames, name)
|
||||||
|
fb.facets = append(fb.facets, facetBuilder)
|
||||||
fb.fields = append(fb.fields, facetBuilder.Field())
|
fb.fields = append(fb.fields, facetBuilder.Field())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -213,6 +251,14 @@ type FacetResult struct {
|
||||||
DateRanges DateRangeFacets `json:"date_ranges,omitempty"`
|
DateRanges DateRangeFacets `json:"date_ranges,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (fr *FacetResult) Size() int {
|
||||||
|
return reflectStaticSizeFacetResult + size.SizeOfPtr +
|
||||||
|
len(fr.Field) +
|
||||||
|
len(fr.Terms)*(reflectStaticSizeTermFacet+size.SizeOfPtr) +
|
||||||
|
len(fr.NumericRanges)*(reflectStaticSizeNumericRangeFacet+size.SizeOfPtr) +
|
||||||
|
len(fr.DateRanges)*(reflectStaticSizeDateRangeFacet+size.SizeOfPtr)
|
||||||
|
}
|
||||||
|
|
||||||
func (fr *FacetResult) Merge(other *FacetResult) {
|
func (fr *FacetResult) Merge(other *FacetResult) {
|
||||||
fr.Total += other.Total
|
fr.Total += other.Total
|
||||||
fr.Missing += other.Missing
|
fr.Missing += other.Missing
|
||||||
|
@ -287,9 +333,9 @@ func (fr FacetResults) Fixup(name string, size int) {
|
||||||
|
|
||||||
func (fb *FacetsBuilder) Results() FacetResults {
|
func (fb *FacetsBuilder) Results() FacetResults {
|
||||||
fr := make(FacetResults)
|
fr := make(FacetResults)
|
||||||
for facetName, facetBuilder := range fb.facets {
|
for i, facetBuilder := range fb.facets {
|
||||||
facetResult := facetBuilder.Result()
|
facetResult := facetBuilder.Result()
|
||||||
fr[facetName] = facetResult
|
fr[fb.facetNames[i]] = facetResult
|
||||||
}
|
}
|
||||||
return fr
|
return fr
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,15 +57,24 @@ func LevenshteinDistance(a, b string) int {
|
||||||
// in which case the first return val will be the max
|
// in which case the first return val will be the max
|
||||||
// and the second will be true, indicating max was exceeded
|
// and the second will be true, indicating max was exceeded
|
||||||
func LevenshteinDistanceMax(a, b string, max int) (int, bool) {
|
func LevenshteinDistanceMax(a, b string, max int) (int, bool) {
|
||||||
|
v, wasMax, _ := LevenshteinDistanceMaxReuseSlice(a, b, max, nil)
|
||||||
|
return v, wasMax
|
||||||
|
}
|
||||||
|
|
||||||
|
func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, []int) {
|
||||||
la := len(a)
|
la := len(a)
|
||||||
lb := len(b)
|
lb := len(b)
|
||||||
|
|
||||||
ld := int(math.Abs(float64(la - lb)))
|
ld := int(math.Abs(float64(la - lb)))
|
||||||
if ld > max {
|
if ld > max {
|
||||||
return max, true
|
return max, true, d
|
||||||
}
|
}
|
||||||
|
|
||||||
d := make([]int, la+1)
|
if cap(d) < la+1 {
|
||||||
|
d = make([]int, la+1)
|
||||||
|
}
|
||||||
|
d = d[:la+1]
|
||||||
|
|
||||||
var lastdiag, olddiag, temp int
|
var lastdiag, olddiag, temp int
|
||||||
|
|
||||||
for i := 1; i <= la; i++ {
|
for i := 1; i <= la; i++ {
|
||||||
|
@ -98,8 +107,8 @@ func LevenshteinDistanceMax(a, b string, max int) (int, bool) {
|
||||||
}
|
}
|
||||||
// after each row if rowmin isn't less than max stop
|
// after each row if rowmin isn't less than max stop
|
||||||
if rowmin > max {
|
if rowmin > max {
|
||||||
return max, true
|
return max, true, d
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return d[la], false
|
return d[la], false, d
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,6 +14,17 @@
|
||||||
|
|
||||||
package search
|
package search
|
||||||
|
|
||||||
|
import (
|
||||||
|
"reflect"
|
||||||
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeDocumentMatchPool int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var dmp DocumentMatchPool
|
||||||
|
reflectStaticSizeDocumentMatchPool = int(reflect.TypeOf(dmp).Size())
|
||||||
|
}
|
||||||
|
|
||||||
// DocumentMatchPoolTooSmall is a callback function that can be executed
|
// DocumentMatchPoolTooSmall is a callback function that can be executed
|
||||||
// when the DocumentMatchPool does not have sufficient capacity
|
// when the DocumentMatchPool does not have sufficient capacity
|
||||||
// By default we just perform just-in-time allocation, but you could log
|
// By default we just perform just-in-time allocation, but you could log
|
||||||
|
|
|
@ -70,9 +70,11 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
|
||||||
}
|
}
|
||||||
ss = append(ss, sr)
|
ss = append(ss, sr)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(ss) < 1 {
|
if len(ss) < 1 {
|
||||||
return searcher.NewMatchNoneSearcher(i)
|
return searcher.NewMatchNoneSearcher(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
return searcher.NewConjunctionSearcher(i, ss, options)
|
return searcher.NewConjunctionSearcher(i, ss, options)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -58,7 +58,8 @@ func (q *DisjunctionQuery) SetMin(m float64) {
|
||||||
q.Min = m
|
q.Min = m
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
|
func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
|
||||||
|
options search.SearcherOptions) (search.Searcher, error) {
|
||||||
ss := make([]search.Searcher, 0, len(q.Disjuncts))
|
ss := make([]search.Searcher, 0, len(q.Disjuncts))
|
||||||
for _, disjunct := range q.Disjuncts {
|
for _, disjunct := range q.Disjuncts {
|
||||||
sr, err := disjunct.Searcher(i, m, options)
|
sr, err := disjunct.Searcher(i, m, options)
|
||||||
|
@ -76,9 +77,17 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping,
|
||||||
}
|
}
|
||||||
ss = append(ss, sr)
|
ss = append(ss, sr)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(ss) < 1 {
|
if len(ss) < 1 {
|
||||||
return searcher.NewMatchNoneSearcher(i)
|
return searcher.NewMatchNoneSearcher(i)
|
||||||
|
} else if len(ss) == 1 && int(q.Min) == ss[0].Min() {
|
||||||
|
// apply optimization only if both conditions below are satisfied:
|
||||||
|
// - disjunction searcher has only 1 child searcher
|
||||||
|
// - parent searcher's min setting is equal to child searcher's min
|
||||||
|
|
||||||
|
return ss[0], nil
|
||||||
}
|
}
|
||||||
|
|
||||||
return searcher.NewDisjunctionSearcher(i, ss, q.Min, options)
|
return searcher.NewDisjunctionSearcher(i, ss, q.Min, options)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -296,32 +296,28 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
expand = func(query Query) (Query, error) {
|
expand = func(query Query) (Query, error) {
|
||||||
switch query.(type) {
|
switch q := query.(type) {
|
||||||
case *QueryStringQuery:
|
case *QueryStringQuery:
|
||||||
q := query.(*QueryStringQuery)
|
|
||||||
parsed, err := parseQuerySyntax(q.Query)
|
parsed, err := parseQuerySyntax(q.Query)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err)
|
return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err)
|
||||||
}
|
}
|
||||||
return expand(parsed)
|
return expand(parsed)
|
||||||
case *ConjunctionQuery:
|
case *ConjunctionQuery:
|
||||||
q := *query.(*ConjunctionQuery)
|
|
||||||
children, err := expandSlice(q.Conjuncts)
|
children, err := expandSlice(q.Conjuncts)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
q.Conjuncts = children
|
q.Conjuncts = children
|
||||||
return &q, nil
|
return q, nil
|
||||||
case *DisjunctionQuery:
|
case *DisjunctionQuery:
|
||||||
q := *query.(*DisjunctionQuery)
|
|
||||||
children, err := expandSlice(q.Disjuncts)
|
children, err := expandSlice(q.Disjuncts)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
q.Disjuncts = children
|
q.Disjuncts = children
|
||||||
return &q, nil
|
return q, nil
|
||||||
case *BooleanQuery:
|
case *BooleanQuery:
|
||||||
q := *query.(*BooleanQuery)
|
|
||||||
var err error
|
var err error
|
||||||
q.Must, err = expand(q.Must)
|
q.Must, err = expand(q.Must)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -335,7 +331,7 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
return &q, nil
|
return q, nil
|
||||||
default:
|
default:
|
||||||
return query, nil
|
return query, nil
|
||||||
}
|
}
|
||||||
|
|
|
@ -273,6 +273,7 @@ func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) {
|
||||||
// see where to go
|
// see where to go
|
||||||
if !l.seenDot && next == '.' {
|
if !l.seenDot && next == '.' {
|
||||||
// stay in this state
|
// stay in this state
|
||||||
|
l.seenDot = true
|
||||||
l.buf += string(next)
|
l.buf += string(next)
|
||||||
return inNumOrStrState, true
|
return inNumOrStrState, true
|
||||||
} else if unicode.IsDigit(next) {
|
} else if unicode.IsDigit(next) {
|
||||||
|
|
|
@ -15,7 +15,6 @@
|
||||||
package query
|
package query
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"regexp"
|
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
|
@ -28,7 +27,6 @@ type RegexpQuery struct {
|
||||||
Regexp string `json:"regexp"`
|
Regexp string `json:"regexp"`
|
||||||
FieldVal string `json:"field,omitempty"`
|
FieldVal string `json:"field,omitempty"`
|
||||||
BoostVal *Boost `json:"boost,omitempty"`
|
BoostVal *Boost `json:"boost,omitempty"`
|
||||||
compiled *regexp.Regexp
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewRegexpQuery creates a new Query which finds
|
// NewRegexpQuery creates a new Query which finds
|
||||||
|
@ -64,33 +62,20 @@ func (q *RegexpQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, opti
|
||||||
if q.FieldVal == "" {
|
if q.FieldVal == "" {
|
||||||
field = m.DefaultSearchField()
|
field = m.DefaultSearchField()
|
||||||
}
|
}
|
||||||
err := q.compile()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options)
|
// require that pattern NOT be anchored to start and end of term.
|
||||||
}
|
// do not attempt to remove trailing $, its presence is not
|
||||||
|
// known to interfere with LiteralPrefix() the way ^ does
|
||||||
func (q *RegexpQuery) Validate() error {
|
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
|
||||||
return q.compile()
|
|
||||||
}
|
|
||||||
|
|
||||||
func (q *RegexpQuery) compile() error {
|
|
||||||
if q.compiled == nil {
|
|
||||||
// require that pattern NOT be anchored to start and end of term
|
|
||||||
actualRegexp := q.Regexp
|
actualRegexp := q.Regexp
|
||||||
if strings.HasPrefix(actualRegexp, "^") {
|
if strings.HasPrefix(actualRegexp, "^") {
|
||||||
actualRegexp = actualRegexp[1:] // remove leading ^
|
actualRegexp = actualRegexp[1:] // remove leading ^
|
||||||
}
|
}
|
||||||
// do not attempt to remove trailing $, it's presence is not
|
|
||||||
// known to interfere with LiteralPrefix() the way ^ does
|
return searcher.NewRegexpStringSearcher(i, actualRegexp, field,
|
||||||
// and removing $ introduces possible ambiguities with escaped \$, \\$, etc
|
q.BoostVal.Value(), options)
|
||||||
var err error
|
}
|
||||||
q.compiled, err = regexp.Compile(actualRegexp)
|
|
||||||
if err != nil {
|
func (q *RegexpQuery) Validate() error {
|
||||||
return err
|
return nil // real validation delayed until searcher constructor
|
||||||
}
|
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,6 @@
|
||||||
package query
|
package query
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"regexp"
|
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
|
@ -47,7 +46,6 @@ type WildcardQuery struct {
|
||||||
Wildcard string `json:"wildcard"`
|
Wildcard string `json:"wildcard"`
|
||||||
FieldVal string `json:"field,omitempty"`
|
FieldVal string `json:"field,omitempty"`
|
||||||
BoostVal *Boost `json:"boost,omitempty"`
|
BoostVal *Boost `json:"boost,omitempty"`
|
||||||
compiled *regexp.Regexp
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewWildcardQuery creates a new Query which finds
|
// NewWildcardQuery creates a new Query which finds
|
||||||
|
@ -83,24 +81,13 @@ func (q *WildcardQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, op
|
||||||
if q.FieldVal == "" {
|
if q.FieldVal == "" {
|
||||||
field = m.DefaultSearchField()
|
field = m.DefaultSearchField()
|
||||||
}
|
}
|
||||||
if q.compiled == nil {
|
|
||||||
var err error
|
|
||||||
q.compiled, err = q.convertToRegexp()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options)
|
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
|
||||||
|
|
||||||
|
return searcher.NewRegexpStringSearcher(i, regexpString, field,
|
||||||
|
q.BoostVal.Value(), options)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (q *WildcardQuery) Validate() error {
|
func (q *WildcardQuery) Validate() error {
|
||||||
var err error
|
return nil // real validation delayed until searcher constructor
|
||||||
q.compiled, err = q.convertToRegexp()
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) {
|
|
||||||
regexpString := wildcardRegexpReplacer.Replace(q.Wildcard)
|
|
||||||
return regexp.Compile(regexpString)
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,13 +15,27 @@
|
||||||
package scorer
|
package scorer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeConjunctionQueryScorer int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var cqs ConjunctionQueryScorer
|
||||||
|
reflectStaticSizeConjunctionQueryScorer = int(reflect.TypeOf(cqs).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type ConjunctionQueryScorer struct {
|
type ConjunctionQueryScorer struct {
|
||||||
options search.SearcherOptions
|
options search.SearcherOptions
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *ConjunctionQueryScorer) Size() int {
|
||||||
|
return reflectStaticSizeConjunctionQueryScorer + size.SizeOfPtr
|
||||||
|
}
|
||||||
|
|
||||||
func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer {
|
func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer {
|
||||||
return &ConjunctionQueryScorer{
|
return &ConjunctionQueryScorer{
|
||||||
options: options,
|
options: options,
|
||||||
|
@ -35,15 +49,11 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
|
||||||
childrenExplanations = make([]*search.Explanation, len(constituents))
|
childrenExplanations = make([]*search.Explanation, len(constituents))
|
||||||
}
|
}
|
||||||
|
|
||||||
locations := []search.FieldTermLocationMap{}
|
|
||||||
for i, docMatch := range constituents {
|
for i, docMatch := range constituents {
|
||||||
sum += docMatch.Score
|
sum += docMatch.Score
|
||||||
if s.options.Explain {
|
if s.options.Explain {
|
||||||
childrenExplanations[i] = docMatch.Expl
|
childrenExplanations[i] = docMatch.Expl
|
||||||
}
|
}
|
||||||
if docMatch.Locations != nil {
|
|
||||||
locations = append(locations, docMatch.Locations)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
newScore := sum
|
newScore := sum
|
||||||
var newExpl *search.Explanation
|
var newExpl *search.Explanation
|
||||||
|
@ -55,11 +65,8 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
|
||||||
rv := constituents[0]
|
rv := constituents[0]
|
||||||
rv.Score = newScore
|
rv.Score = newScore
|
||||||
rv.Expl = newExpl
|
rv.Expl = newExpl
|
||||||
if len(locations) == 1 {
|
rv.FieldTermLocations = search.MergeFieldTermLocations(
|
||||||
rv.Locations = locations[0]
|
rv.FieldTermLocations, constituents[1:])
|
||||||
} else if len(locations) > 1 {
|
|
||||||
rv.Locations = search.MergeLocations(locations)
|
|
||||||
}
|
|
||||||
|
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,11 +16,20 @@ package scorer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeConstantScorer int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var cs ConstantScorer
|
||||||
|
reflectStaticSizeConstantScorer = int(reflect.TypeOf(cs).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type ConstantScorer struct {
|
type ConstantScorer struct {
|
||||||
constant float64
|
constant float64
|
||||||
boost float64
|
boost float64
|
||||||
|
@ -30,6 +39,16 @@ type ConstantScorer struct {
|
||||||
queryWeightExplanation *search.Explanation
|
queryWeightExplanation *search.Explanation
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *ConstantScorer) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeConstantScorer + size.SizeOfPtr
|
||||||
|
|
||||||
|
if s.queryWeightExplanation != nil {
|
||||||
|
sizeInBytes += s.queryWeightExplanation.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer {
|
func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer {
|
||||||
rv := ConstantScorer{
|
rv := ConstantScorer{
|
||||||
options: options,
|
options: options,
|
||||||
|
|
|
@ -16,14 +16,27 @@ package scorer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeDisjunctionQueryScorer int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var dqs DisjunctionQueryScorer
|
||||||
|
reflectStaticSizeDisjunctionQueryScorer = int(reflect.TypeOf(dqs).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type DisjunctionQueryScorer struct {
|
type DisjunctionQueryScorer struct {
|
||||||
options search.SearcherOptions
|
options search.SearcherOptions
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionQueryScorer) Size() int {
|
||||||
|
return reflectStaticSizeDisjunctionQueryScorer + size.SizeOfPtr
|
||||||
|
}
|
||||||
|
|
||||||
func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer {
|
func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer {
|
||||||
return &DisjunctionQueryScorer{
|
return &DisjunctionQueryScorer{
|
||||||
options: options,
|
options: options,
|
||||||
|
@ -37,15 +50,11 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
|
||||||
childrenExplanations = make([]*search.Explanation, len(constituents))
|
childrenExplanations = make([]*search.Explanation, len(constituents))
|
||||||
}
|
}
|
||||||
|
|
||||||
var locations []search.FieldTermLocationMap
|
|
||||||
for i, docMatch := range constituents {
|
for i, docMatch := range constituents {
|
||||||
sum += docMatch.Score
|
sum += docMatch.Score
|
||||||
if s.options.Explain {
|
if s.options.Explain {
|
||||||
childrenExplanations[i] = docMatch.Expl
|
childrenExplanations[i] = docMatch.Expl
|
||||||
}
|
}
|
||||||
if docMatch.Locations != nil {
|
|
||||||
locations = append(locations, docMatch.Locations)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var rawExpl *search.Explanation
|
var rawExpl *search.Explanation
|
||||||
|
@ -67,11 +76,8 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [
|
||||||
rv := constituents[0]
|
rv := constituents[0]
|
||||||
rv.Score = newScore
|
rv.Score = newScore
|
||||||
rv.Expl = newExpl
|
rv.Expl = newExpl
|
||||||
if len(locations) == 1 {
|
rv.FieldTermLocations = search.MergeFieldTermLocations(
|
||||||
rv.Locations = locations[0]
|
rv.FieldTermLocations, constituents[1:])
|
||||||
} else if len(locations) > 1 {
|
|
||||||
rv.Locations = search.MergeLocations(locations)
|
|
||||||
}
|
|
||||||
|
|
||||||
return rv
|
return rv
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,13 +17,22 @@ package scorer
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeTermQueryScorer int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var tqs TermQueryScorer
|
||||||
|
reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type TermQueryScorer struct {
|
type TermQueryScorer struct {
|
||||||
queryTerm []byte
|
queryTerm string
|
||||||
queryField string
|
queryField string
|
||||||
queryBoost float64
|
queryBoost float64
|
||||||
docTerm uint64
|
docTerm uint64
|
||||||
|
@ -36,9 +45,24 @@ type TermQueryScorer struct {
|
||||||
queryWeightExplanation *search.Explanation
|
queryWeightExplanation *search.Explanation
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *TermQueryScorer) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr +
|
||||||
|
len(s.queryTerm) + len(s.queryField)
|
||||||
|
|
||||||
|
if s.idfExplanation != nil {
|
||||||
|
sizeInBytes += s.idfExplanation.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.queryWeightExplanation != nil {
|
||||||
|
sizeInBytes += s.queryWeightExplanation.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer {
|
func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer {
|
||||||
rv := TermQueryScorer{
|
rv := TermQueryScorer{
|
||||||
queryTerm: queryTerm,
|
queryTerm: string(queryTerm),
|
||||||
queryField: queryField,
|
queryField: queryField,
|
||||||
queryBoost: queryBoost,
|
queryBoost: queryBoost,
|
||||||
docTerm: docTerm,
|
docTerm: docTerm,
|
||||||
|
@ -82,7 +106,7 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) {
|
||||||
}
|
}
|
||||||
s.queryWeightExplanation = &search.Explanation{
|
s.queryWeightExplanation = &search.Explanation{
|
||||||
Value: s.queryWeight,
|
Value: s.queryWeight,
|
||||||
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, string(s.queryTerm), s.queryBoost),
|
Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost),
|
||||||
Children: childrenExplanations,
|
Children: childrenExplanations,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -104,7 +128,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
|
||||||
childrenExplanations := make([]*search.Explanation, 3)
|
childrenExplanations := make([]*search.Explanation, 3)
|
||||||
childrenExplanations[0] = &search.Explanation{
|
childrenExplanations[0] = &search.Explanation{
|
||||||
Value: tf,
|
Value: tf,
|
||||||
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, string(s.queryTerm), termMatch.Freq),
|
Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq),
|
||||||
}
|
}
|
||||||
childrenExplanations[1] = &search.Explanation{
|
childrenExplanations[1] = &search.Explanation{
|
||||||
Value: termMatch.Norm,
|
Value: termMatch.Norm,
|
||||||
|
@ -113,7 +137,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
|
||||||
childrenExplanations[2] = s.idfExplanation
|
childrenExplanations[2] = s.idfExplanation
|
||||||
scoreExplanation = &search.Explanation{
|
scoreExplanation = &search.Explanation{
|
||||||
Value: score,
|
Value: score,
|
||||||
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, string(s.queryTerm), termMatch.ID),
|
Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID),
|
||||||
Children: childrenExplanations,
|
Children: childrenExplanations,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -127,7 +151,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
|
||||||
childExplanations[1] = scoreExplanation
|
childExplanations[1] = scoreExplanation
|
||||||
scoreExplanation = &search.Explanation{
|
scoreExplanation = &search.Explanation{
|
||||||
Value: score,
|
Value: score,
|
||||||
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, string(s.queryTerm), s.queryBoost, termMatch.ID),
|
Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID),
|
||||||
Children: childExplanations,
|
Children: childExplanations,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -140,41 +164,31 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term
|
||||||
rv.Expl = scoreExplanation
|
rv.Expl = scoreExplanation
|
||||||
}
|
}
|
||||||
|
|
||||||
if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 {
|
if len(termMatch.Vectors) > 0 {
|
||||||
locs := make([]search.Location, len(termMatch.Vectors))
|
if cap(rv.FieldTermLocations) < len(termMatch.Vectors) {
|
||||||
locsUsed := 0
|
rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors))
|
||||||
|
|
||||||
totalPositions := 0
|
|
||||||
for _, v := range termMatch.Vectors {
|
|
||||||
totalPositions += len(v.ArrayPositions)
|
|
||||||
}
|
|
||||||
positions := make(search.ArrayPositions, totalPositions)
|
|
||||||
positionsUsed := 0
|
|
||||||
|
|
||||||
rv.Locations = make(search.FieldTermLocationMap)
|
|
||||||
for _, v := range termMatch.Vectors {
|
|
||||||
tlm := rv.Locations[v.Field]
|
|
||||||
if tlm == nil {
|
|
||||||
tlm = make(search.TermLocationMap)
|
|
||||||
rv.Locations[v.Field] = tlm
|
|
||||||
}
|
}
|
||||||
|
|
||||||
loc := &locs[locsUsed]
|
for _, v := range termMatch.Vectors {
|
||||||
locsUsed++
|
var ap search.ArrayPositions
|
||||||
|
|
||||||
loc.Pos = v.Pos
|
|
||||||
loc.Start = v.Start
|
|
||||||
loc.End = v.End
|
|
||||||
|
|
||||||
if len(v.ArrayPositions) > 0 {
|
if len(v.ArrayPositions) > 0 {
|
||||||
loc.ArrayPositions = positions[positionsUsed : positionsUsed+len(v.ArrayPositions)]
|
n := len(rv.FieldTermLocations)
|
||||||
for i, ap := range v.ArrayPositions {
|
if n < cap(rv.FieldTermLocations) { // reuse ap slice if available
|
||||||
loc.ArrayPositions[i] = ap
|
ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0]
|
||||||
}
|
}
|
||||||
positionsUsed += len(v.ArrayPositions)
|
ap = append(ap, v.ArrayPositions...)
|
||||||
}
|
}
|
||||||
|
rv.FieldTermLocations =
|
||||||
tlm[string(s.queryTerm)] = append(tlm[string(s.queryTerm)], loc)
|
append(rv.FieldTermLocations, search.FieldTermLocation{
|
||||||
|
Field: v.Field,
|
||||||
|
Term: s.queryTerm,
|
||||||
|
Location: search.Location{
|
||||||
|
Pos: v.Pos,
|
||||||
|
Start: v.Start,
|
||||||
|
End: v.End,
|
||||||
|
ArrayPositions: ap,
|
||||||
|
},
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -16,11 +16,25 @@ package search
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/document"
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeDocumentMatch int
|
||||||
|
var reflectStaticSizeSearchContext int
|
||||||
|
var reflectStaticSizeLocation int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var dm DocumentMatch
|
||||||
|
reflectStaticSizeDocumentMatch = int(reflect.TypeOf(dm).Size())
|
||||||
|
var sc SearchContext
|
||||||
|
reflectStaticSizeSearchContext = int(reflect.TypeOf(sc).Size())
|
||||||
|
var l Location
|
||||||
|
reflectStaticSizeLocation = int(reflect.TypeOf(l).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type ArrayPositions []uint64
|
type ArrayPositions []uint64
|
||||||
|
|
||||||
func (ap ArrayPositions) Equals(other ArrayPositions) bool {
|
func (ap ArrayPositions) Equals(other ArrayPositions) bool {
|
||||||
|
@ -47,6 +61,11 @@ type Location struct {
|
||||||
ArrayPositions ArrayPositions `json:"array_positions"`
|
ArrayPositions ArrayPositions `json:"array_positions"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (l *Location) Size() int {
|
||||||
|
return reflectStaticSizeLocation + size.SizeOfPtr +
|
||||||
|
len(l.ArrayPositions)*size.SizeOfUint64
|
||||||
|
}
|
||||||
|
|
||||||
type Locations []*Location
|
type Locations []*Location
|
||||||
|
|
||||||
type TermLocationMap map[string]Locations
|
type TermLocationMap map[string]Locations
|
||||||
|
@ -57,6 +76,12 @@ func (t TermLocationMap) AddLocation(term string, location *Location) {
|
||||||
|
|
||||||
type FieldTermLocationMap map[string]TermLocationMap
|
type FieldTermLocationMap map[string]TermLocationMap
|
||||||
|
|
||||||
|
type FieldTermLocation struct {
|
||||||
|
Field string
|
||||||
|
Term string
|
||||||
|
Location Location
|
||||||
|
}
|
||||||
|
|
||||||
type FieldFragmentMap map[string][]string
|
type FieldFragmentMap map[string][]string
|
||||||
|
|
||||||
type DocumentMatch struct {
|
type DocumentMatch struct {
|
||||||
|
@ -74,11 +99,14 @@ type DocumentMatch struct {
|
||||||
// fields as float64s and date fields as time.RFC3339 formatted strings.
|
// fields as float64s and date fields as time.RFC3339 formatted strings.
|
||||||
Fields map[string]interface{} `json:"fields,omitempty"`
|
Fields map[string]interface{} `json:"fields,omitempty"`
|
||||||
|
|
||||||
// if we load the document for this hit, remember it so we dont load again
|
|
||||||
Document *document.Document `json:"-"`
|
|
||||||
|
|
||||||
// used to maintain natural index order
|
// used to maintain natural index order
|
||||||
HitNumber uint64 `json:"-"`
|
HitNumber uint64 `json:"-"`
|
||||||
|
|
||||||
|
// used to temporarily hold field term location information during
|
||||||
|
// search processing in an efficient, recycle-friendly manner, to
|
||||||
|
// be later incorporated into the Locations map when search
|
||||||
|
// results are completed
|
||||||
|
FieldTermLocations []FieldTermLocation `json:"-"`
|
||||||
}
|
}
|
||||||
|
|
||||||
func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {
|
func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) {
|
||||||
|
@ -108,15 +136,116 @@ func (dm *DocumentMatch) Reset() *DocumentMatch {
|
||||||
indexInternalID := dm.IndexInternalID
|
indexInternalID := dm.IndexInternalID
|
||||||
// remember the []interface{} used for sort
|
// remember the []interface{} used for sort
|
||||||
sort := dm.Sort
|
sort := dm.Sort
|
||||||
|
// remember the FieldTermLocations backing array
|
||||||
|
ftls := dm.FieldTermLocations
|
||||||
|
for i := range ftls { // recycle the ArrayPositions of each location
|
||||||
|
ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0]
|
||||||
|
}
|
||||||
// idiom to copy over from empty DocumentMatch (0 allocations)
|
// idiom to copy over from empty DocumentMatch (0 allocations)
|
||||||
*dm = DocumentMatch{}
|
*dm = DocumentMatch{}
|
||||||
// reuse the []byte already allocated (and reset len to 0)
|
// reuse the []byte already allocated (and reset len to 0)
|
||||||
dm.IndexInternalID = indexInternalID[:0]
|
dm.IndexInternalID = indexInternalID[:0]
|
||||||
// reuse the []interface{} already allocated (and reset len to 0)
|
// reuse the []interface{} already allocated (and reset len to 0)
|
||||||
dm.Sort = sort[:0]
|
dm.Sort = sort[:0]
|
||||||
|
// reuse the FieldTermLocations already allocated (and reset len to 0)
|
||||||
|
dm.FieldTermLocations = ftls[:0]
|
||||||
return dm
|
return dm
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (dm *DocumentMatch) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeDocumentMatch + size.SizeOfPtr +
|
||||||
|
len(dm.Index) +
|
||||||
|
len(dm.ID) +
|
||||||
|
len(dm.IndexInternalID)
|
||||||
|
|
||||||
|
if dm.Expl != nil {
|
||||||
|
sizeInBytes += dm.Expl.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, v := range dm.Locations {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k)
|
||||||
|
for k1, v1 := range v {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k1) +
|
||||||
|
size.SizeOfSlice
|
||||||
|
for _, entry := range v1 {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, v := range dm.Fragments {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) +
|
||||||
|
size.SizeOfSlice
|
||||||
|
|
||||||
|
for _, entry := range v {
|
||||||
|
sizeInBytes += size.SizeOfString + len(entry)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range dm.Sort {
|
||||||
|
sizeInBytes += size.SizeOfString + len(entry)
|
||||||
|
}
|
||||||
|
|
||||||
|
for k, _ := range dm.Fields {
|
||||||
|
sizeInBytes += size.SizeOfString + len(k) +
|
||||||
|
size.SizeOfPtr
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
|
// Complete performs final preparation & transformation of the
|
||||||
|
// DocumentMatch at the end of search processing, also allowing the
|
||||||
|
// caller to provide an optional preallocated locations slice
|
||||||
|
func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
|
||||||
|
// transform the FieldTermLocations slice into the Locations map
|
||||||
|
nlocs := len(dm.FieldTermLocations)
|
||||||
|
if nlocs > 0 {
|
||||||
|
if cap(prealloc) < nlocs {
|
||||||
|
prealloc = make([]Location, nlocs)
|
||||||
|
}
|
||||||
|
prealloc = prealloc[:nlocs]
|
||||||
|
|
||||||
|
var lastField string
|
||||||
|
var tlm TermLocationMap
|
||||||
|
|
||||||
|
for i, ftl := range dm.FieldTermLocations {
|
||||||
|
if lastField != ftl.Field {
|
||||||
|
lastField = ftl.Field
|
||||||
|
|
||||||
|
if dm.Locations == nil {
|
||||||
|
dm.Locations = make(FieldTermLocationMap)
|
||||||
|
}
|
||||||
|
|
||||||
|
tlm = dm.Locations[ftl.Field]
|
||||||
|
if tlm == nil {
|
||||||
|
tlm = make(TermLocationMap)
|
||||||
|
dm.Locations[ftl.Field] = tlm
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
loc := &prealloc[i]
|
||||||
|
*loc = ftl.Location
|
||||||
|
|
||||||
|
if len(loc.ArrayPositions) > 0 { // copy
|
||||||
|
loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...)
|
||||||
|
}
|
||||||
|
|
||||||
|
tlm[ftl.Term] = append(tlm[ftl.Term], loc)
|
||||||
|
|
||||||
|
dm.FieldTermLocations[i] = FieldTermLocation{ // recycle
|
||||||
|
Location: Location{
|
||||||
|
ArrayPositions: ftl.Location.ArrayPositions[:0],
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle
|
||||||
|
|
||||||
|
return prealloc
|
||||||
|
}
|
||||||
|
|
||||||
func (dm *DocumentMatch) String() string {
|
func (dm *DocumentMatch) String() string {
|
||||||
return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score)
|
return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score)
|
||||||
}
|
}
|
||||||
|
@ -135,6 +264,7 @@ type Searcher interface {
|
||||||
SetQueryNorm(float64)
|
SetQueryNorm(float64)
|
||||||
Count() uint64
|
Count() uint64
|
||||||
Min() int
|
Min() int
|
||||||
|
Size() int
|
||||||
|
|
||||||
DocumentMatchPoolSize() int
|
DocumentMatchPoolSize() int
|
||||||
}
|
}
|
||||||
|
@ -142,9 +272,26 @@ type Searcher interface {
|
||||||
type SearcherOptions struct {
|
type SearcherOptions struct {
|
||||||
Explain bool
|
Explain bool
|
||||||
IncludeTermVectors bool
|
IncludeTermVectors bool
|
||||||
|
Score string
|
||||||
}
|
}
|
||||||
|
|
||||||
// SearchContext represents the context around a single search
|
// SearchContext represents the context around a single search
|
||||||
type SearchContext struct {
|
type SearchContext struct {
|
||||||
DocumentMatchPool *DocumentMatchPool
|
DocumentMatchPool *DocumentMatchPool
|
||||||
|
Collector Collector
|
||||||
|
}
|
||||||
|
|
||||||
|
func (sc *SearchContext) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeSearchContext + size.SizeOfPtr +
|
||||||
|
reflectStaticSizeDocumentMatchPool + size.SizeOfPtr
|
||||||
|
|
||||||
|
if sc.DocumentMatchPool != nil {
|
||||||
|
for _, entry := range sc.DocumentMatchPool.avail {
|
||||||
|
if entry != nil {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,12 +16,21 @@ package searcher
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"math"
|
"math"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
"github.com/blevesearch/bleve/search/scorer"
|
"github.com/blevesearch/bleve/search/scorer"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeBooleanSearcher int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var bs BooleanSearcher
|
||||||
|
reflectStaticSizeBooleanSearcher = int(reflect.TypeOf(bs).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type BooleanSearcher struct {
|
type BooleanSearcher struct {
|
||||||
indexReader index.IndexReader
|
indexReader index.IndexReader
|
||||||
mustSearcher search.Searcher
|
mustSearcher search.Searcher
|
||||||
|
@ -52,6 +61,32 @@ func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc
|
||||||
return &rv, nil
|
return &rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *BooleanSearcher) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr
|
||||||
|
|
||||||
|
if s.mustSearcher != nil {
|
||||||
|
sizeInBytes += s.mustSearcher.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.shouldSearcher != nil {
|
||||||
|
sizeInBytes += s.shouldSearcher.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.mustNotSearcher != nil {
|
||||||
|
sizeInBytes += s.mustNotSearcher.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
sizeInBytes += s.scorer.Size()
|
||||||
|
|
||||||
|
for _, entry := range s.matches {
|
||||||
|
if entry != nil {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (s *BooleanSearcher) computeQueryNorm() {
|
func (s *BooleanSearcher) computeQueryNorm() {
|
||||||
// first calculate sum of squared weights
|
// first calculate sum of squared weights
|
||||||
sumOfSquaredWeights := 0.0
|
sumOfSquaredWeights := 0.0
|
||||||
|
@ -284,6 +319,7 @@ func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return rv, nil
|
return rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -296,6 +332,14 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Advance the searchers only if the currentID cursor is trailing the lookup ID,
|
||||||
|
// additionally if the mustNotSearcher has been initialized, ensure that the
|
||||||
|
// cursor used to track the mustNotSearcher (currMustNot, which isn't tracked by
|
||||||
|
// currentID) is trailing the lookup ID as well - for in the case where currentID
|
||||||
|
// is nil and currMustNot is already at or ahead of the lookup ID, we MUST NOT
|
||||||
|
// advance the currentID or the currMustNot cursors.
|
||||||
|
if (s.currentID == nil || s.currentID.Compare(ID) < 0) &&
|
||||||
|
(s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0) {
|
||||||
var err error
|
var err error
|
||||||
if s.mustSearcher != nil {
|
if s.mustSearcher != nil {
|
||||||
if s.currMust != nil {
|
if s.currMust != nil {
|
||||||
|
@ -306,6 +350,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.shouldSearcher != nil {
|
if s.shouldSearcher != nil {
|
||||||
if s.currShould != nil {
|
if s.currShould != nil {
|
||||||
ctx.DocumentMatchPool.Put(s.currShould)
|
ctx.DocumentMatchPool.Put(s.currShould)
|
||||||
|
@ -315,6 +360,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if s.mustNotSearcher != nil {
|
if s.mustNotSearcher != nil {
|
||||||
if s.currMustNot != nil {
|
if s.currMustNot != nil {
|
||||||
ctx.DocumentMatchPool.Put(s.currMustNot)
|
ctx.DocumentMatchPool.Put(s.currMustNot)
|
||||||
|
@ -332,6 +378,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter
|
||||||
} else {
|
} else {
|
||||||
s.currentID = nil
|
s.currentID = nil
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return s.Next(ctx)
|
return s.Next(ctx)
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,13 +16,22 @@ package searcher
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"math"
|
"math"
|
||||||
|
"reflect"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
"github.com/blevesearch/bleve/search/scorer"
|
"github.com/blevesearch/bleve/search/scorer"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeConjunctionSearcher int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var cs ConjunctionSearcher
|
||||||
|
reflectStaticSizeConjunctionSearcher = int(reflect.TypeOf(cs).Size())
|
||||||
|
}
|
||||||
|
|
||||||
type ConjunctionSearcher struct {
|
type ConjunctionSearcher struct {
|
||||||
indexReader index.IndexReader
|
indexReader index.IndexReader
|
||||||
searchers OrderedSearcherList
|
searchers OrderedSearcherList
|
||||||
|
@ -34,14 +43,27 @@ type ConjunctionSearcher struct {
|
||||||
options search.SearcherOptions
|
options search.SearcherOptions
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, options search.SearcherOptions) (*ConjunctionSearcher, error) {
|
func NewConjunctionSearcher(indexReader index.IndexReader,
|
||||||
// build the downstream searchers
|
qsearchers []search.Searcher, options search.SearcherOptions) (
|
||||||
|
search.Searcher, error) {
|
||||||
|
// build the sorted downstream searchers
|
||||||
searchers := make(OrderedSearcherList, len(qsearchers))
|
searchers := make(OrderedSearcherList, len(qsearchers))
|
||||||
for i, searcher := range qsearchers {
|
for i, searcher := range qsearchers {
|
||||||
searchers[i] = searcher
|
searchers[i] = searcher
|
||||||
}
|
}
|
||||||
// sort the searchers
|
|
||||||
sort.Sort(searchers)
|
sort.Sort(searchers)
|
||||||
|
|
||||||
|
// attempt the "unadorned" conjunction optimization only when we
|
||||||
|
// do not need extra information like freq-norm's or term vectors
|
||||||
|
if len(searchers) > 1 &&
|
||||||
|
options.Score == "none" && !options.IncludeTermVectors {
|
||||||
|
rv, err := optimizeCompositeSearcher("conjunction:unadorned",
|
||||||
|
indexReader, searchers, options)
|
||||||
|
if err != nil || rv != nil {
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// build our searcher
|
// build our searcher
|
||||||
rv := ConjunctionSearcher{
|
rv := ConjunctionSearcher{
|
||||||
indexReader: indexReader,
|
indexReader: indexReader,
|
||||||
|
@ -51,9 +73,36 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S
|
||||||
scorer: scorer.NewConjunctionQueryScorer(options),
|
scorer: scorer.NewConjunctionQueryScorer(options),
|
||||||
}
|
}
|
||||||
rv.computeQueryNorm()
|
rv.computeQueryNorm()
|
||||||
|
|
||||||
|
// attempt push-down conjunction optimization when there's >1 searchers
|
||||||
|
if len(searchers) > 1 {
|
||||||
|
rv, err := optimizeCompositeSearcher("conjunction",
|
||||||
|
indexReader, searchers, options)
|
||||||
|
if err != nil || rv != nil {
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return &rv, nil
|
return &rv, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *ConjunctionSearcher) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr +
|
||||||
|
s.scorer.Size()
|
||||||
|
|
||||||
|
for _, entry := range s.searchers {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range s.currs {
|
||||||
|
if entry != nil {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
func (s *ConjunctionSearcher) computeQueryNorm() {
|
func (s *ConjunctionSearcher) computeQueryNorm() {
|
||||||
// first calculate sum of squared weights
|
// first calculate sum of squared weights
|
||||||
sumOfSquaredWeights := 0.0
|
sumOfSquaredWeights := 0.0
|
||||||
|
@ -108,7 +157,7 @@ func (s *ConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentM
|
||||||
var rv *search.DocumentMatch
|
var rv *search.DocumentMatch
|
||||||
var err error
|
var err error
|
||||||
OUTER:
|
OUTER:
|
||||||
for s.currs[s.maxIDIdx] != nil {
|
for s.maxIDIdx < len(s.currs) && s.currs[s.maxIDIdx] != nil {
|
||||||
maxID := s.currs[s.maxIDIdx].IndexInternalID
|
maxID := s.currs[s.maxIDIdx].IndexInternalID
|
||||||
|
|
||||||
i := 0
|
i := 0
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
// Copyright (c) 2014 Couchbase, Inc.
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
//
|
//
|
||||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
// you may not use this file except in compliance with the License.
|
// you may not use this file except in compliance with the License.
|
||||||
|
@ -16,12 +16,9 @@ package searcher
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"math"
|
|
||||||
"sort"
|
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
"github.com/blevesearch/bleve/search/scorer"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// DisjunctionMaxClauseCount is a compile time setting that applications can
|
// DisjunctionMaxClauseCount is a compile time setting that applications can
|
||||||
|
@ -29,17 +26,74 @@ import (
|
||||||
// error instead of exeucting searches when the size exceeds this value.
|
// error instead of exeucting searches when the size exceeds this value.
|
||||||
var DisjunctionMaxClauseCount = 0
|
var DisjunctionMaxClauseCount = 0
|
||||||
|
|
||||||
type DisjunctionSearcher struct {
|
// DisjunctionHeapTakeover is a compile time setting that applications can
|
||||||
indexReader index.IndexReader
|
// adjust to control when the DisjunctionSearcher will switch from a simple
|
||||||
searchers OrderedSearcherList
|
// slice implementation to a heap implementation.
|
||||||
numSearchers int
|
var DisjunctionHeapTakeover = 10
|
||||||
queryNorm float64
|
|
||||||
currs []*search.DocumentMatch
|
func NewDisjunctionSearcher(indexReader index.IndexReader,
|
||||||
scorer *scorer.DisjunctionQueryScorer
|
qsearchers []search.Searcher, min float64, options search.SearcherOptions) (
|
||||||
min int
|
search.Searcher, error) {
|
||||||
matching []*search.DocumentMatch
|
return newDisjunctionSearcher(indexReader, qsearchers, min, options, true)
|
||||||
matchingIdxs []int
|
}
|
||||||
initialized bool
|
|
||||||
|
func newDisjunctionSearcher(indexReader index.IndexReader,
|
||||||
|
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
|
||||||
|
limit bool) (search.Searcher, error) {
|
||||||
|
// attempt the "unadorned" disjunction optimization only when we
|
||||||
|
// do not need extra information like freq-norm's or term vectors
|
||||||
|
// and the requested min is simple
|
||||||
|
if len(qsearchers) > 1 && min <= 1 &&
|
||||||
|
options.Score == "none" && !options.IncludeTermVectors {
|
||||||
|
rv, err := optimizeCompositeSearcher("disjunction:unadorned",
|
||||||
|
indexReader, qsearchers, options)
|
||||||
|
if err != nil || rv != nil {
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(qsearchers) > DisjunctionHeapTakeover {
|
||||||
|
return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options,
|
||||||
|
limit)
|
||||||
|
}
|
||||||
|
return newDisjunctionSliceSearcher(indexReader, qsearchers, min, options,
|
||||||
|
limit)
|
||||||
|
}
|
||||||
|
|
||||||
|
func optimizeCompositeSearcher(optimizationKind string,
|
||||||
|
indexReader index.IndexReader, qsearchers []search.Searcher,
|
||||||
|
options search.SearcherOptions) (search.Searcher, error) {
|
||||||
|
var octx index.OptimizableContext
|
||||||
|
|
||||||
|
for _, searcher := range qsearchers {
|
||||||
|
o, ok := searcher.(index.Optimizable)
|
||||||
|
if !ok {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var err error
|
||||||
|
octx, err = o.Optimize(optimizationKind, octx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if octx == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
optimized, err := octx.Finish()
|
||||||
|
if err != nil || optimized == nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
tfr, ok := optimized.(index.TermFieldReader)
|
||||||
|
if !ok {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return newTermSearcherFromReader(indexReader, tfr,
|
||||||
|
[]byte(optimizationKind), "*", 1.0, options)
|
||||||
}
|
}
|
||||||
|
|
||||||
func tooManyClauses(count int) bool {
|
func tooManyClauses(count int) bool {
|
||||||
|
@ -49,226 +103,7 @@ func tooManyClauses(count int) bool {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func tooManyClausesErr() error {
|
func tooManyClausesErr(count int) error {
|
||||||
return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]",
|
return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]",
|
||||||
DisjunctionMaxClauseCount)
|
count, DisjunctionMaxClauseCount)
|
||||||
}
|
|
||||||
|
|
||||||
func NewDisjunctionSearcher(indexReader index.IndexReader,
|
|
||||||
qsearchers []search.Searcher, min float64, options search.SearcherOptions) (
|
|
||||||
*DisjunctionSearcher, error) {
|
|
||||||
return newDisjunctionSearcher(indexReader, qsearchers, min, options,
|
|
||||||
true)
|
|
||||||
}
|
|
||||||
|
|
||||||
func newDisjunctionSearcher(indexReader index.IndexReader,
|
|
||||||
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
|
|
||||||
limit bool) (
|
|
||||||
*DisjunctionSearcher, error) {
|
|
||||||
if limit && tooManyClauses(len(qsearchers)) {
|
|
||||||
return nil, tooManyClausesErr()
|
|
||||||
}
|
|
||||||
// build the downstream searchers
|
|
||||||
searchers := make(OrderedSearcherList, len(qsearchers))
|
|
||||||
for i, searcher := range qsearchers {
|
|
||||||
searchers[i] = searcher
|
|
||||||
}
|
|
||||||
// sort the searchers
|
|
||||||
sort.Sort(sort.Reverse(searchers))
|
|
||||||
// build our searcher
|
|
||||||
rv := DisjunctionSearcher{
|
|
||||||
indexReader: indexReader,
|
|
||||||
searchers: searchers,
|
|
||||||
numSearchers: len(searchers),
|
|
||||||
currs: make([]*search.DocumentMatch, len(searchers)),
|
|
||||||
scorer: scorer.NewDisjunctionQueryScorer(options),
|
|
||||||
min: int(min),
|
|
||||||
matching: make([]*search.DocumentMatch, len(searchers)),
|
|
||||||
matchingIdxs: make([]int, len(searchers)),
|
|
||||||
}
|
|
||||||
rv.computeQueryNorm()
|
|
||||||
return &rv, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) computeQueryNorm() {
|
|
||||||
// first calculate sum of squared weights
|
|
||||||
sumOfSquaredWeights := 0.0
|
|
||||||
for _, searcher := range s.searchers {
|
|
||||||
sumOfSquaredWeights += searcher.Weight()
|
|
||||||
}
|
|
||||||
// now compute query norm from this
|
|
||||||
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
|
|
||||||
// finally tell all the downstream searchers the norm
|
|
||||||
for _, searcher := range s.searchers {
|
|
||||||
searcher.SetQueryNorm(s.queryNorm)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error {
|
|
||||||
var err error
|
|
||||||
// get all searchers pointing at their first match
|
|
||||||
for i, searcher := range s.searchers {
|
|
||||||
if s.currs[i] != nil {
|
|
||||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
|
||||||
}
|
|
||||||
s.currs[i], err = searcher.Next(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
err = s.updateMatches()
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
s.initialized = true
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) updateMatches() error {
|
|
||||||
matching := s.matching[:0]
|
|
||||||
matchingIdxs := s.matchingIdxs[:0]
|
|
||||||
|
|
||||||
for i := 0; i < len(s.currs); i++ {
|
|
||||||
curr := s.currs[i]
|
|
||||||
if curr == nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if len(matching) > 0 {
|
|
||||||
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
|
|
||||||
if cmp > 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if cmp < 0 {
|
|
||||||
matching = matching[:0]
|
|
||||||
matchingIdxs = matchingIdxs[:0]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
matching = append(matching, curr)
|
|
||||||
matchingIdxs = append(matchingIdxs, i)
|
|
||||||
}
|
|
||||||
|
|
||||||
s.matching = matching
|
|
||||||
s.matchingIdxs = matchingIdxs
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) Weight() float64 {
|
|
||||||
var rv float64
|
|
||||||
for _, searcher := range s.searchers {
|
|
||||||
rv += searcher.Weight()
|
|
||||||
}
|
|
||||||
return rv
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) SetQueryNorm(qnorm float64) {
|
|
||||||
for _, searcher := range s.searchers {
|
|
||||||
searcher.SetQueryNorm(qnorm)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) Next(ctx *search.SearchContext) (
|
|
||||||
*search.DocumentMatch, error) {
|
|
||||||
if !s.initialized {
|
|
||||||
err := s.initSearchers(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
var err error
|
|
||||||
var rv *search.DocumentMatch
|
|
||||||
|
|
||||||
found := false
|
|
||||||
for !found && len(s.matching) > 0 {
|
|
||||||
if len(s.matching) >= s.min {
|
|
||||||
found = true
|
|
||||||
// score this match
|
|
||||||
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
|
|
||||||
}
|
|
||||||
|
|
||||||
// invoke next on all the matching searchers
|
|
||||||
for _, i := range s.matchingIdxs {
|
|
||||||
searcher := s.searchers[i]
|
|
||||||
if s.currs[i] != rv {
|
|
||||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
|
||||||
}
|
|
||||||
s.currs[i], err = searcher.Next(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
err = s.updateMatches()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext,
|
|
||||||
ID index.IndexInternalID) (*search.DocumentMatch, error) {
|
|
||||||
if !s.initialized {
|
|
||||||
err := s.initSearchers(ctx)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// get all searchers pointing at their first match
|
|
||||||
var err error
|
|
||||||
for i, searcher := range s.searchers {
|
|
||||||
if s.currs[i] != nil {
|
|
||||||
if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
ctx.DocumentMatchPool.Put(s.currs[i])
|
|
||||||
}
|
|
||||||
s.currs[i], err = searcher.Advance(ctx, ID)
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
err = s.updateMatches()
|
|
||||||
if err != nil {
|
|
||||||
return nil, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return s.Next(ctx)
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) Count() uint64 {
|
|
||||||
// for now return a worst case
|
|
||||||
var sum uint64
|
|
||||||
for _, searcher := range s.searchers {
|
|
||||||
sum += searcher.Count()
|
|
||||||
}
|
|
||||||
return sum
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) Close() (rv error) {
|
|
||||||
for _, searcher := range s.searchers {
|
|
||||||
err := searcher.Close()
|
|
||||||
if err != nil && rv == nil {
|
|
||||||
rv = err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rv
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) Min() int {
|
|
||||||
return s.min
|
|
||||||
}
|
|
||||||
|
|
||||||
func (s *DisjunctionSearcher) DocumentMatchPoolSize() int {
|
|
||||||
rv := len(s.currs)
|
|
||||||
for _, s := range s.searchers {
|
|
||||||
rv += s.DocumentMatchPoolSize()
|
|
||||||
}
|
|
||||||
return rv
|
|
||||||
}
|
}
|
||||||
|
|
343
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go
generated
vendored
Normal file
343
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_heap.go
generated
vendored
Normal file
|
@ -0,0 +1,343 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package searcher
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"container/heap"
|
||||||
|
"math"
|
||||||
|
"reflect"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/index"
|
||||||
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/search/scorer"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeDisjunctionHeapSearcher int
|
||||||
|
var reflectStaticSizeSearcherCurr int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var dhs DisjunctionHeapSearcher
|
||||||
|
reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size())
|
||||||
|
|
||||||
|
var sc SearcherCurr
|
||||||
|
reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size())
|
||||||
|
}
|
||||||
|
|
||||||
|
type SearcherCurr struct {
|
||||||
|
searcher search.Searcher
|
||||||
|
curr *search.DocumentMatch
|
||||||
|
}
|
||||||
|
|
||||||
|
type DisjunctionHeapSearcher struct {
|
||||||
|
indexReader index.IndexReader
|
||||||
|
|
||||||
|
numSearchers int
|
||||||
|
scorer *scorer.DisjunctionQueryScorer
|
||||||
|
min int
|
||||||
|
queryNorm float64
|
||||||
|
initialized bool
|
||||||
|
searchers []search.Searcher
|
||||||
|
heap []*SearcherCurr
|
||||||
|
|
||||||
|
matching []*search.DocumentMatch
|
||||||
|
matchingCurrs []*SearcherCurr
|
||||||
|
}
|
||||||
|
|
||||||
|
func newDisjunctionHeapSearcher(indexReader index.IndexReader,
|
||||||
|
searchers []search.Searcher, min float64, options search.SearcherOptions,
|
||||||
|
limit bool) (
|
||||||
|
*DisjunctionHeapSearcher, error) {
|
||||||
|
if limit && tooManyClauses(len(searchers)) {
|
||||||
|
return nil, tooManyClausesErr(len(searchers))
|
||||||
|
}
|
||||||
|
|
||||||
|
// build our searcher
|
||||||
|
rv := DisjunctionHeapSearcher{
|
||||||
|
indexReader: indexReader,
|
||||||
|
searchers: searchers,
|
||||||
|
numSearchers: len(searchers),
|
||||||
|
scorer: scorer.NewDisjunctionQueryScorer(options),
|
||||||
|
min: int(min),
|
||||||
|
matching: make([]*search.DocumentMatch, len(searchers)),
|
||||||
|
matchingCurrs: make([]*SearcherCurr, len(searchers)),
|
||||||
|
heap: make([]*SearcherCurr, 0, len(searchers)),
|
||||||
|
}
|
||||||
|
rv.computeQueryNorm()
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr +
|
||||||
|
s.scorer.Size()
|
||||||
|
|
||||||
|
for _, entry := range s.searchers {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range s.matching {
|
||||||
|
if entry != nil {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// for matchingCurrs and heap, just use static size * len
|
||||||
|
// since searchers and document matches already counted above
|
||||||
|
sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr
|
||||||
|
sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) computeQueryNorm() {
|
||||||
|
// first calculate sum of squared weights
|
||||||
|
sumOfSquaredWeights := 0.0
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
sumOfSquaredWeights += searcher.Weight()
|
||||||
|
}
|
||||||
|
// now compute query norm from this
|
||||||
|
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
|
||||||
|
// finally tell all the downstream searchers the norm
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
searcher.SetQueryNorm(s.queryNorm)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error {
|
||||||
|
// alloc a single block of SearcherCurrs
|
||||||
|
block := make([]SearcherCurr, len(s.searchers))
|
||||||
|
|
||||||
|
// get all searchers pointing at their first match
|
||||||
|
for i, searcher := range s.searchers {
|
||||||
|
curr, err := searcher.Next(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if curr != nil {
|
||||||
|
block[i].searcher = searcher
|
||||||
|
block[i].curr = curr
|
||||||
|
heap.Push(s, &block[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err := s.updateMatches()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
s.initialized = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) updateMatches() error {
|
||||||
|
matching := s.matching[:0]
|
||||||
|
matchingCurrs := s.matchingCurrs[:0]
|
||||||
|
|
||||||
|
if len(s.heap) > 0 {
|
||||||
|
|
||||||
|
// top of the heap is our next hit
|
||||||
|
next := heap.Pop(s).(*SearcherCurr)
|
||||||
|
matching = append(matching, next.curr)
|
||||||
|
matchingCurrs = append(matchingCurrs, next)
|
||||||
|
|
||||||
|
// now as long as top of heap matches, keep popping
|
||||||
|
for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 {
|
||||||
|
next = heap.Pop(s).(*SearcherCurr)
|
||||||
|
matching = append(matching, next.curr)
|
||||||
|
matchingCurrs = append(matchingCurrs, next)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
s.matching = matching
|
||||||
|
s.matchingCurrs = matchingCurrs
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Weight() float64 {
|
||||||
|
var rv float64
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
rv += searcher.Weight()
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) {
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
searcher.SetQueryNorm(qnorm)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) (
|
||||||
|
*search.DocumentMatch, error) {
|
||||||
|
if !s.initialized {
|
||||||
|
err := s.initSearchers(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var rv *search.DocumentMatch
|
||||||
|
found := false
|
||||||
|
for !found && len(s.matching) > 0 {
|
||||||
|
if len(s.matching) >= s.min {
|
||||||
|
found = true
|
||||||
|
// score this match
|
||||||
|
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
|
||||||
|
}
|
||||||
|
|
||||||
|
// invoke next on all the matching searchers
|
||||||
|
for _, matchingCurr := range s.matchingCurrs {
|
||||||
|
if matchingCurr.curr != rv {
|
||||||
|
ctx.DocumentMatchPool.Put(matchingCurr.curr)
|
||||||
|
}
|
||||||
|
curr, err := matchingCurr.searcher.Next(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if curr != nil {
|
||||||
|
matchingCurr.curr = curr
|
||||||
|
heap.Push(s, matchingCurr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err := s.updateMatches()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext,
|
||||||
|
ID index.IndexInternalID) (*search.DocumentMatch, error) {
|
||||||
|
if !s.initialized {
|
||||||
|
err := s.initSearchers(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// if there is anything in matching, toss it back onto the heap
|
||||||
|
for _, matchingCurr := range s.matchingCurrs {
|
||||||
|
heap.Push(s, matchingCurr)
|
||||||
|
}
|
||||||
|
s.matching = s.matching[:0]
|
||||||
|
s.matchingCurrs = s.matchingCurrs[:0]
|
||||||
|
|
||||||
|
// find all searchers that actually need to be advanced
|
||||||
|
// advance them, using s.matchingCurrs as temp storage
|
||||||
|
for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 {
|
||||||
|
searcherCurr := heap.Pop(s).(*SearcherCurr)
|
||||||
|
ctx.DocumentMatchPool.Put(searcherCurr.curr)
|
||||||
|
curr, err := searcherCurr.searcher.Advance(ctx, ID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if curr != nil {
|
||||||
|
searcherCurr.curr = curr
|
||||||
|
s.matchingCurrs = append(s.matchingCurrs, searcherCurr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// now all of the searchers that we advanced have to be pushed back
|
||||||
|
for _, matchingCurr := range s.matchingCurrs {
|
||||||
|
heap.Push(s, matchingCurr)
|
||||||
|
}
|
||||||
|
// reset our temp space
|
||||||
|
s.matchingCurrs = s.matchingCurrs[:0]
|
||||||
|
|
||||||
|
err := s.updateMatches()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return s.Next(ctx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Count() uint64 {
|
||||||
|
// for now return a worst case
|
||||||
|
var sum uint64
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
sum += searcher.Count()
|
||||||
|
}
|
||||||
|
return sum
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Close() (rv error) {
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
err := searcher.Close()
|
||||||
|
if err != nil && rv == nil {
|
||||||
|
rv = err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Min() int {
|
||||||
|
return s.min
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int {
|
||||||
|
rv := len(s.searchers)
|
||||||
|
for _, s := range s.searchers {
|
||||||
|
rv += s.DocumentMatchPoolSize()
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
// a disjunction searcher implements the index.Optimizable interface
|
||||||
|
// but only activates on an edge case where the disjunction is a
|
||||||
|
// wrapper around a single Optimizable child searcher
|
||||||
|
func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) (
|
||||||
|
index.OptimizableContext, error) {
|
||||||
|
if len(s.searchers) == 1 {
|
||||||
|
o, ok := s.searchers[0].(index.Optimizable)
|
||||||
|
if ok {
|
||||||
|
return o.Optimize(kind, octx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return octx, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// heap impl
|
||||||
|
|
||||||
|
// Len reports the number of entries currently on the heap.
func (s *DisjunctionHeapSearcher) Len() int {
	return len(s.heap)
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Less(i, j int) bool {
|
||||||
|
if s.heap[i].curr == nil {
|
||||||
|
return true
|
||||||
|
} else if s.heap[j].curr == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
return bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Swap(i, j int) {
|
||||||
|
s.heap[i], s.heap[j] = s.heap[j], s.heap[i]
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Push(x interface{}) {
|
||||||
|
s.heap = append(s.heap, x.(*SearcherCurr))
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionHeapSearcher) Pop() interface{} {
|
||||||
|
old := s.heap
|
||||||
|
n := len(old)
|
||||||
|
x := old[n-1]
|
||||||
|
s.heap = old[0 : n-1]
|
||||||
|
return x
|
||||||
|
}
|
298
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go
generated
vendored
Normal file
298
vendor/github.com/blevesearch/bleve/search/searcher/search_disjunction_slice.go
generated
vendored
Normal file
|
@ -0,0 +1,298 @@
|
||||||
|
// Copyright (c) 2018 Couchbase, Inc.
|
||||||
|
//
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package searcher
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"reflect"
|
||||||
|
"sort"
|
||||||
|
|
||||||
|
"github.com/blevesearch/bleve/index"
|
||||||
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/search/scorer"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeDisjunctionSliceSearcher int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var ds DisjunctionSliceSearcher
|
||||||
|
reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size())
|
||||||
|
}
|
||||||
|
|
||||||
|
type DisjunctionSliceSearcher struct {
|
||||||
|
indexReader index.IndexReader
|
||||||
|
searchers OrderedSearcherList
|
||||||
|
numSearchers int
|
||||||
|
queryNorm float64
|
||||||
|
currs []*search.DocumentMatch
|
||||||
|
scorer *scorer.DisjunctionQueryScorer
|
||||||
|
min int
|
||||||
|
matching []*search.DocumentMatch
|
||||||
|
matchingIdxs []int
|
||||||
|
initialized bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func newDisjunctionSliceSearcher(indexReader index.IndexReader,
|
||||||
|
qsearchers []search.Searcher, min float64, options search.SearcherOptions,
|
||||||
|
limit bool) (
|
||||||
|
*DisjunctionSliceSearcher, error) {
|
||||||
|
if limit && tooManyClauses(len(qsearchers)) {
|
||||||
|
return nil, tooManyClausesErr(len(qsearchers))
|
||||||
|
}
|
||||||
|
// build the downstream searchers
|
||||||
|
searchers := make(OrderedSearcherList, len(qsearchers))
|
||||||
|
for i, searcher := range qsearchers {
|
||||||
|
searchers[i] = searcher
|
||||||
|
}
|
||||||
|
// sort the searchers
|
||||||
|
sort.Sort(sort.Reverse(searchers))
|
||||||
|
// build our searcher
|
||||||
|
rv := DisjunctionSliceSearcher{
|
||||||
|
indexReader: indexReader,
|
||||||
|
searchers: searchers,
|
||||||
|
numSearchers: len(searchers),
|
||||||
|
currs: make([]*search.DocumentMatch, len(searchers)),
|
||||||
|
scorer: scorer.NewDisjunctionQueryScorer(options),
|
||||||
|
min: int(min),
|
||||||
|
matching: make([]*search.DocumentMatch, len(searchers)),
|
||||||
|
matchingIdxs: make([]int, len(searchers)),
|
||||||
|
}
|
||||||
|
rv.computeQueryNorm()
|
||||||
|
return &rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) Size() int {
|
||||||
|
sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr +
|
||||||
|
s.scorer.Size()
|
||||||
|
|
||||||
|
for _, entry := range s.searchers {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range s.currs {
|
||||||
|
if entry != nil {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, entry := range s.matching {
|
||||||
|
if entry != nil {
|
||||||
|
sizeInBytes += entry.Size()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt
|
||||||
|
|
||||||
|
return sizeInBytes
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) computeQueryNorm() {
|
||||||
|
// first calculate sum of squared weights
|
||||||
|
sumOfSquaredWeights := 0.0
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
sumOfSquaredWeights += searcher.Weight()
|
||||||
|
}
|
||||||
|
// now compute query norm from this
|
||||||
|
s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights)
|
||||||
|
// finally tell all the downstream searchers the norm
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
searcher.SetQueryNorm(s.queryNorm)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error {
|
||||||
|
var err error
|
||||||
|
// get all searchers pointing at their first match
|
||||||
|
for i, searcher := range s.searchers {
|
||||||
|
if s.currs[i] != nil {
|
||||||
|
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||||
|
}
|
||||||
|
s.currs[i], err = searcher.Next(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err = s.updateMatches()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
s.initialized = true
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) updateMatches() error {
|
||||||
|
matching := s.matching[:0]
|
||||||
|
matchingIdxs := s.matchingIdxs[:0]
|
||||||
|
|
||||||
|
for i := 0; i < len(s.currs); i++ {
|
||||||
|
curr := s.currs[i]
|
||||||
|
if curr == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(matching) > 0 {
|
||||||
|
cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID)
|
||||||
|
if cmp > 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if cmp < 0 {
|
||||||
|
matching = matching[:0]
|
||||||
|
matchingIdxs = matchingIdxs[:0]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
matching = append(matching, curr)
|
||||||
|
matchingIdxs = append(matchingIdxs, i)
|
||||||
|
}
|
||||||
|
|
||||||
|
s.matching = matching
|
||||||
|
s.matchingIdxs = matchingIdxs
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) Weight() float64 {
|
||||||
|
var rv float64
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
rv += searcher.Weight()
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) {
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
searcher.SetQueryNorm(qnorm)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) Next(ctx *search.SearchContext) (
|
||||||
|
*search.DocumentMatch, error) {
|
||||||
|
if !s.initialized {
|
||||||
|
err := s.initSearchers(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var err error
|
||||||
|
var rv *search.DocumentMatch
|
||||||
|
|
||||||
|
found := false
|
||||||
|
for !found && len(s.matching) > 0 {
|
||||||
|
if len(s.matching) >= s.min {
|
||||||
|
found = true
|
||||||
|
// score this match
|
||||||
|
rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers)
|
||||||
|
}
|
||||||
|
|
||||||
|
// invoke next on all the matching searchers
|
||||||
|
for _, i := range s.matchingIdxs {
|
||||||
|
searcher := s.searchers[i]
|
||||||
|
if s.currs[i] != rv {
|
||||||
|
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||||
|
}
|
||||||
|
s.currs[i], err = searcher.Next(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err = s.updateMatches()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rv, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext,
|
||||||
|
ID index.IndexInternalID) (*search.DocumentMatch, error) {
|
||||||
|
if !s.initialized {
|
||||||
|
err := s.initSearchers(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// get all searchers pointing at their first match
|
||||||
|
var err error
|
||||||
|
for i, searcher := range s.searchers {
|
||||||
|
if s.currs[i] != nil {
|
||||||
|
if s.currs[i].IndexInternalID.Compare(ID) >= 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ctx.DocumentMatchPool.Put(s.currs[i])
|
||||||
|
}
|
||||||
|
s.currs[i], err = searcher.Advance(ctx, ID)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err = s.updateMatches()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return s.Next(ctx)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) Count() uint64 {
|
||||||
|
// for now return a worst case
|
||||||
|
var sum uint64
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
sum += searcher.Count()
|
||||||
|
}
|
||||||
|
return sum
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) Close() (rv error) {
|
||||||
|
for _, searcher := range s.searchers {
|
||||||
|
err := searcher.Close()
|
||||||
|
if err != nil && rv == nil {
|
||||||
|
rv = err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) Min() int {
|
||||||
|
return s.min
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int {
|
||||||
|
rv := len(s.currs)
|
||||||
|
for _, s := range s.searchers {
|
||||||
|
rv += s.DocumentMatchPoolSize()
|
||||||
|
}
|
||||||
|
return rv
|
||||||
|
}
|
||||||
|
|
||||||
|
// a disjunction searcher implements the index.Optimizable interface
|
||||||
|
// but only activates on an edge case where the disjunction is a
|
||||||
|
// wrapper around a single Optimizable child searcher
|
||||||
|
func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) (
|
||||||
|
index.OptimizableContext, error) {
|
||||||
|
if len(s.searchers) == 1 {
|
||||||
|
o, ok := s.searchers[0].(index.Optimizable)
|
||||||
|
if ok {
|
||||||
|
return o.Optimize(kind, octx)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return octx, nil
|
||||||
|
}
|
|
@ -15,11 +15,21 @@
|
||||||
package searcher
|
package searcher
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
"github.com/blevesearch/bleve/search/scorer"
|
"github.com/blevesearch/bleve/search/scorer"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeDocIDSearcher int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var ds DocIDSearcher
|
||||||
|
reflectStaticSizeDocIDSearcher = int(reflect.TypeOf(ds).Size())
|
||||||
|
}
|
||||||
|
|
||||||
// DocIDSearcher returns documents matching a predefined set of identifiers.
|
// DocIDSearcher returns documents matching a predefined set of identifiers.
|
||||||
type DocIDSearcher struct {
|
type DocIDSearcher struct {
|
||||||
reader index.DocIDReader
|
reader index.DocIDReader
|
||||||
|
@ -42,6 +52,12 @@ func NewDocIDSearcher(indexReader index.IndexReader, ids []string, boost float64
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *DocIDSearcher) Size() int {
|
||||||
|
return reflectStaticSizeDocIDSearcher + size.SizeOfPtr +
|
||||||
|
s.reader.Size() +
|
||||||
|
s.scorer.Size()
|
||||||
|
}
|
||||||
|
|
||||||
func (s *DocIDSearcher) Count() uint64 {
|
func (s *DocIDSearcher) Count() uint64 {
|
||||||
return uint64(s.count)
|
return uint64(s.count)
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,10 +15,20 @@
|
||||||
package searcher
|
package searcher
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"reflect"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
|
"github.com/blevesearch/bleve/size"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var reflectStaticSizeFilteringSearcher int
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
var fs FilteringSearcher
|
||||||
|
reflectStaticSizeFilteringSearcher = int(reflect.TypeOf(fs).Size())
|
||||||
|
}
|
||||||
|
|
||||||
// FilterFunc defines a function which can filter documents
|
// FilterFunc defines a function which can filter documents
|
||||||
// returning true means keep the document
|
// returning true means keep the document
|
||||||
// returning false means do not keep the document
|
// returning false means do not keep the document
|
||||||
|
@ -38,6 +48,11 @@ func NewFilteringSearcher(s search.Searcher, filter FilterFunc) *FilteringSearch
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (f *FilteringSearcher) Size() int {
|
||||||
|
return reflectStaticSizeFilteringSearcher + size.SizeOfPtr +
|
||||||
|
f.child.Size()
|
||||||
|
}
|
||||||
|
|
||||||
func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
|
func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) {
|
||||||
next, err := f.child.Next(ctx)
|
next, err := f.child.Next(ctx)
|
||||||
for next != nil && err == nil {
|
for next != nil && err == nil {
|
||||||
|
|
|
@ -15,13 +15,26 @@
|
||||||
package searcher
|
package searcher
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
"github.com/blevesearch/bleve/index"
|
"github.com/blevesearch/bleve/index"
|
||||||
"github.com/blevesearch/bleve/search"
|
"github.com/blevesearch/bleve/search"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var MaxFuzziness = 2
|
||||||
|
|
||||||
func NewFuzzySearcher(indexReader index.IndexReader, term string,
|
func NewFuzzySearcher(indexReader index.IndexReader, term string,
|
||||||
prefix, fuzziness int, field string, boost float64,
|
prefix, fuzziness int, field string, boost float64,
|
||||||
options search.SearcherOptions) (search.Searcher, error) {
|
options search.SearcherOptions) (search.Searcher, error) {
|
||||||
|
|
||||||
|
if fuzziness > MaxFuzziness {
|
||||||
|
return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness)
|
||||||
|
}
|
||||||
|
|
||||||
|
if fuzziness < 0 {
|
||||||
|
return nil, fmt.Errorf("invalid fuzziness, negative")
|
||||||
|
}
|
||||||
|
|
||||||
// Note: we don't byte slice the term for a prefix because of runes.
|
// Note: we don't byte slice the term for a prefix because of runes.
|
||||||
prefixTerm := ""
|
prefixTerm := ""
|
||||||
for i, r := range term {
|
for i, r := range term {
|
||||||
|
@ -31,7 +44,6 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
|
candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness,
|
||||||
field, prefixTerm)
|
field, prefixTerm)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -45,12 +57,40 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string,
|
||||||
func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
|
func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
|
||||||
fuzziness int, field, prefixTerm string) (rv []string, err error) {
|
fuzziness int, field, prefixTerm string) (rv []string, err error) {
|
||||||
rv = make([]string, 0)
|
rv = make([]string, 0)
|
||||||
|
|
||||||
|
// in case of advanced reader implementations directly call
|
||||||
|
// the levenshtein automaton based iterator to collect the
|
||||||
|
// candidate terms
|
||||||
|
if ir, ok := indexReader.(index.IndexReaderFuzzy); ok {
|
||||||
|
fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if cerr := fieldDict.Close(); cerr != nil && err == nil {
|
||||||
|
err = cerr
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
tfd, err := fieldDict.Next()
|
||||||
|
for err == nil && tfd != nil {
|
||||||
|
rv = append(rv, tfd.Term)
|
||||||
|
if tooManyClauses(len(rv)) {
|
||||||
|
return nil, tooManyClausesErr(len(rv))
|
||||||
|
}
|
||||||
|
tfd, err = fieldDict.Next()
|
||||||
|
}
|
||||||
|
return rv, err
|
||||||
|
}
|
||||||
|
|
||||||
var fieldDict index.FieldDict
|
var fieldDict index.FieldDict
|
||||||
if len(prefixTerm) > 0 {
|
if len(prefixTerm) > 0 {
|
||||||
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
|
fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm))
|
||||||
} else {
|
} else {
|
||||||
fieldDict, err = indexReader.FieldDict(field)
|
fieldDict, err = indexReader.FieldDict(field)
|
||||||
}
|
}
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
if cerr := fieldDict.Close(); cerr != nil && err == nil {
|
if cerr := fieldDict.Close(); cerr != nil && err == nil {
|
||||||
err = cerr
|
err = cerr
|
||||||
|
@ -58,13 +98,16 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string,
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// enumerate terms and check levenshtein distance
|
// enumerate terms and check levenshtein distance
|
||||||
|
var reuse []int
|
||||||
tfd, err := fieldDict.Next()
|
tfd, err := fieldDict.Next()
|
||||||
for err == nil && tfd != nil {
|
for err == nil && tfd != nil {
|
||||||
ld, exceeded := search.LevenshteinDistanceMax(term, tfd.Term, fuzziness)
|
var ld int
|
||||||
|
var exceeded bool
|
||||||
|
ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse)
|
||||||
if !exceeded && ld <= fuzziness {
|
if !exceeded && ld <= fuzziness {
|
||||||
rv = append(rv, tfd.Term)
|
rv = append(rv, tfd.Term)
|
||||||
if tooManyClauses(len(rv)) {
|
if tooManyClauses(len(rv)) {
|
||||||
return rv, tooManyClausesErr()
|
return nil, tooManyClausesErr(len(rv))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
tfd, err = fieldDict.Next()
|
tfd, err = fieldDict.Next()
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue