2021SC@SDUSC
.dvm文件存放DocValue域的元数据,比如DocValue的偏移量。
.dvd则存放了DocValue的数据。
在Solr4.8.0中,dvd以及dvm用到的Lucene编码格式是Lucene45DocValuesFormat。跟之前的文件格式类似,它分别包含Lucene45DocValuesProducer
和Lucene45DocValuesConsumer来实现该文件的读和写。
/**
 * Creates the writer for this doc-values format.
 *
 * @param state write state of the segment being written
 * @return a {@code Lucene45DocValuesConsumer} built with this format's data/meta codec names
 *     and file extensions
 * @throws IOException if constructing the consumer fails
 */
@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
  final DocValuesConsumer consumer =
      new Lucene45DocValuesConsumer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
  return consumer;
}

/**
 * Creates the reader for this doc-values format.
 *
 * @param state read state of the segment being opened
 * @return a {@code Lucene45DocValuesProducer} built with this format's data/meta codec names
 *     and file extensions
 * @throws IOException if constructing the producer fails
 */
@Override
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
  final DocValuesProducer producer =
      new Lucene45DocValuesProducer(state, DATA_CODEC, DATA_EXTENSION, META_CODEC, META_EXTENSION);
  return producer;
}
Lucene 4.5 DocValues format通过下面的策略对四种类型进行编码:
NUMERIC
BINARY
Sorted:
SortedSet:
首先来介绍下.dvm的文件格式:
.dvm的文件结构分为好多层:
第一层:.dvm 由Header,NumFields个Entry,Footer组成
第二层:Entry具有四种类型,NumericEntry | BinaryEntry | SortedEntry | SortedSetEntry
第三层:
SortedEntry: 包含FieldNumber,EntryType,BinaryEntry,NumericEntry
SortedSetEntry: 包含EntryType,BinaryEntry,NumericEntry,NumericEntry
同样.dvd 文件具有好几层结构:
第一层:Header,<NumericData | BinaryData | SortedData>NumFields,Footer 与dvm类似,NumFields个Data(SortedData,BinaryData,NumericData其中一个)
第二层:
第三层:
SortedSet入口储存了BinaryData中的序号的列表,使用一个增长的vLong类型的序列,并用差值编码。
前文讲到Lucene45DocValuesFormat分别包含Lucene45DocValuesProducer和Lucene45DocValuesConsumer来实现该文件的读和写,那么本节内容主要以Lucene45DocValuesProducer为例来学习下dvm和dvd。
首先学习下Lucene45DocValuesProducer的初始化:主要作用是读取.dvm文件和.dvd流。其中在读取.dvm文件过程中,Lucene45DocValuesProducer调用了readFields(in, state.fieldInfos)来获取入口信息。
protected Lucene45DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException { //.dvm文件名 String metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension); // read in the entries from the metadata file. //打开.dvm并获取检验和,获取文件流, ChecksumIndexInput in = state.directory.openChecksumInput(metaName, state.context); //获取segment的document个数 this.maxDoc = state.segmentInfo.getDocCount(); boolean success = false; try { //获取.dvm header version = CodecUtil.checkHeader(in, metaCodec, Lucene45DocValuesFormat.VERSION_START, Lucene45DocValuesFormat.VERSION_CURRENT); numerics = new HashMap<>(); ords = new HashMap<>(); ordIndexes = new HashMap<>(); binaries = new HashMap<>(); sortedSets = new HashMap<>(); //读取NumFields个<Entry> readFields(in, state.fieldInfos); //加入Footer if (version >= Lucene45DocValuesFormat.VERSION_CHECKSUM) { CodecUtil.checkFooter(in); } else { CodecUtil.checkEOF(in); } success = true; } finally { if (success) { IOUtils.close(in); } else { IOUtils.closeWhileHandlingException(in); } } success = false; try { //.dvd文件名 String dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension); //打开.dvd文件 data = state.directory.openInput(dataName, state.context); //获取.dvd header final int version2 = CodecUtil.checkHeader(data, dataCodec, Lucene45DocValuesFormat.VERSION_START, Lucene45DocValuesFormat.VERSION_CURRENT); if (version != version2) { throw new CorruptIndexException("Format versions mismatch"); } success = true; } finally { if (!success) { IOUtils.closeWhileHandlingException(this.data); } } //估算类的大小,也就是估算.dvd流的大小 ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass())); }
readFields(in, state.fieldInfos)主要是读取EntryType,根据它的值来选择哪种方式来读取后续的Entry信息,
函数中涉及了以下几种方式:
1.NumericEntry类型readNumericEntry()
2.BinaryEntry类型readBinaryEntry()
3.SortedEntry类型readSortedField()
4.SortedSetEntry类型readSortedSetEntry(),同时在该类型下,readFields还分别调用了readSortedSetFieldWithAddresses和readSortedField
private void readFields(IndexInput meta, FieldInfos infos) throws IOException { //读取Entry的编号,如果编号为-1,表示这是最后一个Entry。 int fieldNumber = meta.readVInt(); while (fieldNumber != -1) { // check should be: infos.fieldInfo(fieldNumber) != null, which incorporates negative check // but docvalues updates are currently buggy here (loading extra stuff, etc): LUCENE-5616 if (fieldNumber < 0) { // trickier to validate more: because we re-use for norms, because we use multiple entries // for "composite" types like sortedset, etc. throw new CorruptIndexException("Invalid field number: " + fieldNumber + " (resource=" + meta + ")"); } //读取EntryType,以此来区分Entry的类型,0表示NUMERICENTRY,1表示BINARYENTRY,2表示SORTEDENTRY,3表示SORTED_SETENTRY byte type = meta.readByte(); if (type == Lucene45DocValuesFormat.NUMERIC) { //获取具体的NumericEntry内容,并放入以编号为键,NumericEntry为值的map中 numerics.put(fieldNumber, readNumericEntry(meta)); } else if (type == Lucene45DocValuesFormat.BINARY) { //获取具体的BinaryEntry内容,并放入以编号为键,BinaryEntry为值的map中 BinaryEntry b = readBinaryEntry(meta); binaries.put(fieldNumber, b); } else if (type == Lucene45DocValuesFormat.SORTED) { //读取SortedEntry readSortedField(fieldNumber, meta, infos); } else if (type == Lucene45DocValuesFormat.SORTED_SET) { //读取SortedSetEntry,并放入以编号为键,SortedSetEntry为值的map中 SortedSetEntry ss = readSortedSetEntry(meta); sortedSets.put(fieldNumber, ss); //标准的存储有序的集合是否通过address的间接转换,SORTED_SET_WITH_ADDRESSES是docid->address>ord映射 if (ss.format == SORTED_SET_WITH_ADDRESSES) { readSortedSetFieldWithAddresses(fieldNumber, meta, infos); //SORTED_SET_SINGLE_VALUED_SORTED 只存储docid->ord的值 } else if (ss.format == SORTED_SET_SINGLE_VALUED_SORTED) { if (meta.readVInt() != fieldNumber) { throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } if (meta.readByte() != Lucene45DocValuesFormat.SORTED) { throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } 
readSortedField(fieldNumber, meta, infos); } else { throw new AssertionError(); } } else { throw new CorruptIndexException("invalid type: " + type + ", resource=" + meta); } //读取下一个Entry fieldNumber = meta.readVInt(); } }
static NumericEntry readNumericEntry(IndexInput meta) throws IOException { NumericEntry entry = new NumericEntry(); entry.format = meta.readVInt(); //NumericType,Numeric的编码方式 entry.missingOffset = meta.readLong(); //MissingOffset表示该field在哪个document中缺失,如果为-1表示没有document缺失字段 entry.packedIntsVersion = meta.readVInt(); //PackedVersion 打包整数的version entry.offset = meta.readLong(); //DataOffset 指向.dvd文件中数据起始位置的指针 entry.count = meta.readVLong(); //Count 已写的值的个数 entry.blockSize = meta.readVInt(); //BlockSize 已打包的整数的大小 switch(entry.format) { case GCD_COMPRESSED: //GCD-compressed(最大公约数压缩) entry.minValue = meta.readLong(); //MinValue entry.gcd = meta.readLong(); //GCD break; case TABLE_COMPRESSED: //Table-compressed(表压缩) if (entry.count > Integer.MAX_VALUE) { throw new CorruptIndexException("Cannot use TABLE_COMPRESSED with more than MAX_VALUE values, input=" + meta); } final int uniqueValues = meta.readVInt(); //TableSize if (uniqueValues > 256) { //TableSize必须小于256 throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, input=" + meta); } entry.table = new long[uniqueValues]; //TableSize个Long for (int i = 0; i < uniqueValues; ++i) { entry.table[i] = meta.readLong(); } break; case DELTA_COMPRESSED: //Delta-compressed(增量压缩) break; default: throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); } return entry; }
static BinaryEntry readBinaryEntry(IndexInput meta) throws IOException { BinaryEntry entry = new BinaryEntry(); entry.format = meta.readVInt(); //BinaryType类型 entry.missingOffset = meta.readLong(); //缺失表示,同NuericEntry entry.minLength = meta.readVInt(); //存储Binary 类型的值的位数组的长度的最小值和最大值。 //如果这两个值是相等的,那么所有的值都是固定的大小, //并且可以通过DataOffset + (docID * length)计算出来。 //否则,Binary的值是不定长的 entry.maxLength = meta.readVInt(); entry.count = meta.readVLong(); entry.offset = meta.readLong(); //实际二进制数的偏移 switch(entry.format) { case BINARY_FIXED_UNCOMPRESSED: //Fixed-width Binary break; case BINARY_PREFIX_COMPRESSED: //Variable-width Binary entry.addressInterval = meta.readVInt(); entry.addressesOffset = meta.readLong(); entry.packedIntsVersion = meta.readVInt(); entry.blockSize = meta.readVInt(); break; case BINARY_VARIABLE_UNCOMPRESSED: //Prefix-compressed Binary entry.addressesOffset = meta.readLong(); entry.packedIntsVersion = meta.readVInt(); entry.blockSize = meta.readVInt(); break; default: throw new CorruptIndexException("Unknown format: " + entry.format + ", input=" + meta); } return entry; }
private void readSortedSetFieldWithAddresses(int fieldNumber, IndexInput meta, FieldInfos infos) throws IOException { // sortedset = binary + numeric (addresses) + ordIndex if (meta.readVInt() != fieldNumber) { throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } BinaryEntry b = readBinaryEntry(meta); binaries.put(fieldNumber, b); if (meta.readVInt() != fieldNumber) { throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } NumericEntry n1 = readNumericEntry(meta); ords.put(fieldNumber, n1); if (meta.readVInt() != fieldNumber) { throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { throw new CorruptIndexException("sortedset entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } NumericEntry n2 = readNumericEntry(meta); ordIndexes.put(fieldNumber, n2); }
private void readSortedField(int fieldNumber, IndexInput meta, FieldInfos infos) throws IOException { // sorted = binary + numeric if (meta.readVInt() != fieldNumber) { throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } if (meta.readByte() != Lucene45DocValuesFormat.BINARY) { throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } BinaryEntry b = readBinaryEntry(meta); binaries.put(fieldNumber, b); if (meta.readVInt() != fieldNumber) { throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } if (meta.readByte() != Lucene45DocValuesFormat.NUMERIC) { throw new CorruptIndexException("sorted entry for field: " + fieldNumber + " is corrupt (resource=" + meta + ")"); } NumericEntry n = readNumericEntry(meta); ords.put(fieldNumber, n); }
上文讲了.dvm文件的读取,那么接下来学习下如何读取.dvd文件。