From 3b5b2a6286df48a4ab471aece74bc7b7947042ad Mon Sep 17 00:00:00 2001
From: sychen
Date: Fri, 29 Mar 2024 10:57:10 -0700
Subject: [PATCH] ORC-1667: Add `check` tool to check the index of the specified column

### What changes were proposed in this pull request?
This PR adds a `check` tool that checks whether the index of the specified column can filter out the specified values. The filtering effect can be tested by specifying one of three types.

`check --type stat` - Use only column statistics.
`check --type bloom-filter` - Use only the bloom filter.
`check --type predicate` - Use column statistics and the bloom filter in combination.

### Why are the changes needed?
ORC supports specifying multiple columns for which bloom filter indexes are generated, but it lacks a convenient tool to verify the effect of those bloom filters.

Parquet also has a similar command.
[PARQUET-2138](https://issues.apache.org/jira/browse/PARQUET-2138): Add ShowBloomFilterCommand to parquet-cli

### How was this patch tested?
Added a unit test (`TestCheckTool`).

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #1862 from cxzl25/ORC-1667.

Authored-by: sychen
Signed-off-by: Dongjoon Hyun
---
 .../java/org/apache/orc/tools/CheckTool.java  | 336 ++++++++++++++++++
 .../src/java/org/apache/orc/tools/Driver.java |   4 +
 .../orc/tools/bloomfilter/TestCheckTool.java  | 212 +++++++++++
 site/_docs/java-tools.md                      |  19 +
 4 files changed, 571 insertions(+)
 create mode 100644 java/tools/src/java/org/apache/orc/tools/CheckTool.java
 create mode 100644 java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java

diff --git a/java/tools/src/java/org/apache/orc/tools/CheckTool.java b/java/tools/src/java/org/apache/orc/tools/CheckTool.java
new file mode 100644
index 0000000000..2d90241bd8
--- /dev/null
+++ b/java/tools/src/java/org/apache/orc/tools/CheckTool.java
@@ -0,0 +1,336 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
+import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
+import org.apache.orc.ColumnStatistics;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcProto;
+import org.apache.orc.OrcUtils;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.ColumnStatisticsImpl;
+import org.apache.orc.impl.OrcIndex;
+import org.apache.orc.impl.RecordReaderImpl;
+import org.apache.orc.util.BloomFilter;
+import org.apache.orc.util.BloomFilterIO;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Check whether the index of the specified column in one or more ORC files can filter the specified values.
+ */
+public class CheckTool {
+
+  private static final String CHECK_TYPE_PREDICATE = "predicate";
+  private static final String CHECK_TYPE_STAT = "stat";
+  private static final String CHECK_TYPE_BLOOM_FILTER = "bloom-filter";
+
+  public static void main(Configuration conf, String[] args) throws Exception {
+    Options opts = createOptions();
+    CommandLine cli = new DefaultParser().parse(opts, args);
+    HelpFormatter formatter = new HelpFormatter();
+    if (cli.hasOption('h')) {
+      formatter.printHelp("check", opts);
+      return;
+    }
+
+    String type = cli.getOptionValue("type");
+    if (type == null ||
+        (!type.equals(CHECK_TYPE_PREDICATE) &&
+         !type.equals(CHECK_TYPE_STAT) &&
+         !type.equals(CHECK_TYPE_BLOOM_FILTER))) {
+      System.err.printf("type %s is not supported%n", type);
+      formatter.printHelp("check", opts);
+      return;
+    }
+    String column = cli.getOptionValue("column");
+    if (column == null || column.isEmpty()) {
+      System.err.println("column is not specified");
+      formatter.printHelp("check", opts);
+      return;
+    }
+    String[] values = cli.getOptionValues("values");
+    if (values == null || values.length == 0) {
+      System.err.println("values are not specified");
+      formatter.printHelp("check", opts);
+      return;
+    }
+    boolean ignoreExtension = cli.hasOption("ignoreExtension");
+
+    List<Path> inputFiles = new ArrayList<>();
+    String[] files = cli.getArgs();
+    for (String root : files) {
+      Path rootPath = new Path(root);
+      FileSystem fs = rootPath.getFileSystem(conf);
+      for (RemoteIterator<LocatedFileStatus> itr = fs.listFiles(rootPath, true); itr.hasNext(); ) {
+        LocatedFileStatus status = itr.next();
+        if (status.isFile() && (ignoreExtension || status.getPath().getName().endsWith(".orc"))) {
+          inputFiles.add(status.getPath());
+        }
+      }
+    }
+    if (inputFiles.isEmpty()) {
+      System.err.println("No files found.");
+      System.exit(1);
+    }
+
+    for (Path inputFile : inputFiles) {
+      System.out.println("input file: " + inputFile);
+      FileSystem fs = inputFile.getFileSystem(conf);
+      try (Reader
+          reader = OrcFile.createReader(inputFile,
+              OrcFile.readerOptions(conf).filesystem(fs))) {
+        RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+        TypeDescription schema = reader.getSchema();
+        boolean[] includedColumns = OrcUtils.includeColumns(column, schema);
+        int colIndex = -1;
+        for (int i = 0; i < includedColumns.length; i++) {
+          if (includedColumns[i]) {
+            colIndex = i;
+            break;
+          }
+        }
+        if (colIndex == -1) {
+          System.err.printf("column: %s not found in file: %s%n", column, inputFile);
+          continue;
+        }
+        int stripeIndex = -1;
+        for (StripeInformation stripe : reader.getStripes()) {
+          ++stripeIndex;
+
+          OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+
+          OrcProto.ColumnEncoding columnEncoding = footer.getColumns(colIndex);
+          TypeDescription subtype = reader.getSchema().findSubtype(colIndex);
+          TypeDescription.Category columnCategory = subtype.getCategory();
+          OrcIndex indices = rows.readRowIndex(stripeIndex, null, includedColumns);
+          if (type.equals(CHECK_TYPE_BLOOM_FILTER)) {
+            checkBloomFilter(inputFile, reader, indices, stripeIndex,
+                colIndex, column, columnEncoding, columnCategory, values);
+          } else {
+            checkStatOrPredicate(inputFile, reader, indices, stripeIndex,
+                colIndex, column, columnEncoding, subtype, columnCategory, values, type);
+          }
+        }
+      }
+    }
+  }
+
+  private static void checkStatOrPredicate(Path inputFile,
+                                           Reader reader,
+                                           OrcIndex indices,
+                                           int stripeIndex,
+                                           int colIndex,
+                                           String column,
+                                           OrcProto.ColumnEncoding columnEncoding,
+                                           TypeDescription subtype,
+                                           TypeDescription.Category columnCategory,
+                                           String[] values,
+                                           String type) {
+    OrcProto.RowIndex rowGroupIndex = indices.getRowGroupIndex()[colIndex];
+    int entryCount = rowGroupIndex.getEntryCount();
+    boolean hasBloomFilter = true;
+    OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
+    OrcProto.BloomFilterIndex bloomFilterIndex = bloomFilterIndices[colIndex];
+    if (bloomFilterIndex == null || bloomFilterIndex.getBloomFilterList().isEmpty()) {
+      hasBloomFilter = false;
+    }
+    for (int i = 0; i < entryCount; i++) {
+      OrcProto.ColumnStatistics statistics = rowGroupIndex.getEntry(i).getStatistics();
+      ColumnStatistics cs = ColumnStatisticsImpl.deserialize(subtype,
+          statistics,
+          reader.writerUsedProlepticGregorian(),
+          reader.getConvertToProlepticGregorian());
+
+      BloomFilter bloomFilter = null;
+      if (type.equals(CHECK_TYPE_PREDICATE) && hasBloomFilter) {
+        bloomFilter = BloomFilterIO.deserialize(
+            indices.getBloomFilterKinds()[colIndex], columnEncoding,
+            reader.getWriterVersion(), columnCategory, bloomFilterIndex.getBloomFilter(i));
+      }
+
+      for (String value : values) {
+        PredicateLeaf predicateLeaf = createPredicateLeaf(PredicateLeaf.Operator.EQUALS,
+            getPredicateLeafType(columnCategory), column, convert(columnCategory, value));
+        SearchArgument.TruthValue truthValue = RecordReaderImpl.evaluatePredicate(
+            cs, predicateLeaf, bloomFilter);
+        System.out.printf("stripe: %d, rowIndex: %d, value: %s, test value: %s%n",
+            stripeIndex, i, value, truthValue);
+      }
+    }
+  }
+
+  private static void checkBloomFilter(Path inputFile,
+                                       Reader reader,
+                                       OrcIndex indices,
+                                       int stripeIndex,
+                                       int colIndex,
+                                       String column,
+                                       OrcProto.ColumnEncoding columnEncoding,
+                                       TypeDescription.Category columnCategory,
+                                       String[] values) {
+    OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
+    OrcProto.BloomFilterIndex bloomFilterIndex = bloomFilterIndices[colIndex];
+    if (bloomFilterIndex == null || bloomFilterIndex.getBloomFilterList().isEmpty()) {
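+      // A column only carries a bloom filter index if it was selected via
+      // orc.bloom.filter.columns (writer option bloomFilterColumns) when the
+      // file was written.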
System.err.printf("The bloom filter index for column: %s is not found in file: %s%n", + column, inputFile); + return; + } + List bloomFilterList = bloomFilterIndex.getBloomFilterList(); + for (int i = 0; i < bloomFilterList.size(); i++) { + OrcProto.BloomFilter bf = bloomFilterList.get(i); + org.apache.orc.util.BloomFilter bloomFilter = BloomFilterIO.deserialize( + indices.getBloomFilterKinds()[colIndex], columnEncoding, + reader.getWriterVersion(), columnCategory, bf); + for (String value : values) { + boolean testResult = test(bloomFilter, columnCategory, value); + if (testResult) { + System.out.printf("stripe: %d, rowIndex: %d, value: %s, bloom filter: maybe exist%n", + stripeIndex, i, value); + } else { + System.out.printf("stripe: %d, rowIndex: %d, value: %s, bloom filter: not exist%n", + stripeIndex, i, value); + } + } + } + } + + private static boolean test(BloomFilter bloomFilter, + TypeDescription.Category columnCategory, String value) { + switch (columnCategory){ + case BYTE: + case SHORT: + case INT: + case LONG: + case DATE: + case TIMESTAMP: + return bloomFilter.testLong(Long.parseLong(value)); + case FLOAT: + case DOUBLE: + return bloomFilter.testDouble(Double.parseDouble(value)); + case STRING: + case CHAR: + case VARCHAR: + case DECIMAL: + return bloomFilter.testString(value); + default: + throw new IllegalStateException("Not supported type:" + columnCategory); + } + } + + private static Object convert( + TypeDescription.Category columnCategory, String value) { + switch (columnCategory) { + case BYTE: + case SHORT: + case INT: + case LONG: + case DATE: + case TIMESTAMP: + return Long.parseLong(value); + case FLOAT: + case DOUBLE: + return Double.parseDouble(value); + case STRING: + case CHAR: + case VARCHAR: + case DECIMAL: + return value; + default: + throw new IllegalStateException("Not supported type:" + columnCategory); + } + } + + private static PredicateLeaf.Type getPredicateLeafType(TypeDescription.Category columnCategory) { + switch (columnCategory){ + case BOOLEAN: + return PredicateLeaf.Type.BOOLEAN; + case BYTE: + case SHORT: + case INT: + case LONG: + return PredicateLeaf.Type.LONG; + case DATE: + return PredicateLeaf.Type.DATE; + case TIMESTAMP: + return PredicateLeaf.Type.TIMESTAMP; + case FLOAT: + case DOUBLE: + return PredicateLeaf.Type.FLOAT; + case STRING: + case CHAR: + case VARCHAR: + case DECIMAL: + return PredicateLeaf.Type.STRING; + default: + throw new IllegalStateException("Not supported type:" + columnCategory); + } + } + + private static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator, + PredicateLeaf.Type type, + String columnName, + Object literal) { + return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName, + literal, null); + } + + private static Options createOptions() { + Options result = new Options(); + + result.addOption(Option.builder("t") + .longOpt("type") + .desc(String.format("check type = {%s, %s, %s}", + CHECK_TYPE_PREDICATE, CHECK_TYPE_STAT, CHECK_TYPE_BLOOM_FILTER)) + .hasArg() + .build()); + + result.addOption(Option.builder("col") + .longOpt("column") + .desc("column name") + .hasArg() + .build()); + + result.addOption(Option.builder("v") + .longOpt("values") + .desc("test values") + .hasArgs() + .build()); + + result.addOption(Option.builder("h") + .longOpt("help") + .desc("print help message") + .build()); + return result; + } +} diff --git a/java/tools/src/java/org/apache/orc/tools/Driver.java b/java/tools/src/java/org/apache/orc/tools/Driver.java index 5b993c2e9c..0d2778b410 100644 --- 
a/java/tools/src/java/org/apache/orc/tools/Driver.java +++ b/java/tools/src/java/org/apache/orc/tools/Driver.java @@ -86,6 +86,7 @@ public static void main(String[] args) throws Exception { " [--define X=Y] "); System.err.println(); System.err.println("Commands:"); + System.err.println(" check - check the index of the specified column"); System.err.println(" convert - convert CSV/JSON/ORC files to ORC"); System.err.println(" count - recursively find *.orc and print the number of rows"); System.err.println(" data - print the data from the ORC file"); @@ -106,6 +107,9 @@ public static void main(String[] args) throws Exception { conf.set(pair.getKey().toString(), pair.getValue().toString()); } switch (options.command) { + case "check": + CheckTool.main(conf, options.commandArgs); + break; case "convert": ConvertTool.main(conf, options.commandArgs); break; diff --git a/java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java b/java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java new file mode 100644 index 0000000000..ada80a5695 --- /dev/null +++ b/java/tools/src/test/org/apache/orc/tools/bloomfilter/TestCheckTool.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.orc.tools.bloomfilter;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
+import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
+import org.apache.orc.OrcFile;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.Writer;
+import org.apache.orc.tools.CheckTool;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class TestCheckTool {
+  private Path workDir = new Path(System.getProperty("test.tmp.dir"));
+  private Configuration conf;
+  private FileSystem fs;
+  private Path testFilePath;
+
+  @BeforeEach
+  public void openFileSystem() throws Exception {
+    conf = new Configuration();
+    fs = FileSystem.getLocal(conf);
+    fs.setWorkingDirectory(workDir);
+    testFilePath = new Path("TestCheckTool.testCheckTool.orc");
+    fs.delete(testFilePath, false);
+    createFile();
+  }
+
+  private void createFile() throws IOException {
+    TypeDescription schema = TypeDescription.fromString("struct<x:bigint,y:string,z:string>");
+    Writer writer = OrcFile.createWriter(testFilePath,
+        OrcFile.writerOptions(conf)
+            .bloomFilterColumns("x,y")
+            .rowIndexStride(5000)
+            .setSchema(schema));
+    VectorizedRowBatch batch = schema.createRowBatch();
+    LongColumnVector x = (LongColumnVector) batch.cols[0];
+    BytesColumnVector y = (BytesColumnVector) batch.cols[1];
+    BytesColumnVector z = (BytesColumnVector) batch.cols[2];
+    for (int r = 0; r < 10000; ++r) {
+      int row = batch.size++;
+      x.vector[row] = r;
+      byte[] yBuffer = ("y-byte-" + r).getBytes(StandardCharsets.UTF_8);
+      byte[] zBuffer = ("z-byte-" + r).getBytes(StandardCharsets.UTF_8);
+      y.setRef(row, yBuffer, 0, yBuffer.length);
+      z.setRef(row, zBuffer, 0, zBuffer.length);
+      if (batch.size == batch.getMaxSize()) {
+        writer.addRowBatch(batch);
+        batch.reset();
+      }
+    }
+    if (batch.size != 0) {
+      writer.addRowBatch(batch);
+    }
+    writer.close();
+  }
+
+  @Test
+  public void testPredicate() throws Exception {
+    PrintStream origOut = System.out;
+    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
+    System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8));
+
+    CheckTool.main(conf, new String[]{
+        "--type", "predicate",
+        "--values", "0", "--values", "5566",
+        "--column", "x",
+        testFilePath.toString()});
+
+    CheckTool.main(conf, new String[]{
+        "--type", "predicate",
+        "--values", "y-byte-1234", "--values", "y-byte-5566",
+        "--column", "y",
+        testFilePath.toString()});
+
+    CheckTool.main(conf, new String[]{
+        "--type", "predicate",
+        "--values", "z-byte-1234", "--values", "z-byte-5566",
+        "--column", "z",
+        testFilePath.toString()});
+
+    System.out.flush();
+    System.setOut(origOut);
+    String output = myOut.toString(StandardCharsets.UTF_8);
+
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 0, test value: YES_NO"));
+    assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 5566, test value: NO"));
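+    // rowIndexStride(5000) with 10000 rows yields two row groups in stripe 0:
+    // rowIndex 0 covers rows 0-4999 and rowIndex 1 covers rows 5000-9999.
+    // YES_NO means the row group may contain the value; NO means it cannot.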
assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 0, test value: NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 5566, test value: YES_NO")); + + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-1234, test value: YES_NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-5566, test value: NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-1234, test value: NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-5566, test value: YES_NO")); + + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-1234, test value: YES_NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-5566, test value: YES_NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-1234, test value: NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-5566, test value: YES_NO")); + } + + @Test + public void testStatistics() throws Exception { + PrintStream origOut = System.out; + ByteArrayOutputStream myOut = new ByteArrayOutputStream(); + System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8)); + + CheckTool.main(conf, new String[]{ + "--type", "stat", + "--values", "0", "--values", "5566", + "--column", "x", + testFilePath.toString()}); + + CheckTool.main(conf, new String[]{ + "--type", "stat", + "--values", "y-byte-1234", "--values", "y-byte-5566", + "--column", "y", + testFilePath.toString()}); + + CheckTool.main(conf, new String[]{ + "--type", "stat", + "--values", "z-byte-1234", "--values", "z-byte-5566", + "--column", "z", + testFilePath.toString()}); + + System.out.flush(); + System.setOut(origOut); + String output = myOut.toString(StandardCharsets.UTF_8); + + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 0, test value: YES_NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 5566, test value: NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 0, test value: NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 5566, test value: YES_NO")); + + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-1234, test value: YES_NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-5566, test value: YES_NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-1234, test value: NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-5566, test value: YES_NO")); + + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-1234, test value: YES_NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: z-byte-5566, test value: YES_NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-1234, test value: NO")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: z-byte-5566, test value: YES_NO")); + } + + @Test + public void testBloomFilter() throws Exception { + PrintStream origOut = System.out; + ByteArrayOutputStream myOut = new ByteArrayOutputStream(); + System.setOut(new PrintStream(myOut, false, StandardCharsets.UTF_8)); + + CheckTool.main(conf, new String[]{ + "--type", "bloom-filter", + "--values", "0", "--values", "5566", + "--column", "x", + testFilePath.toString()}); + + CheckTool.main(conf, new String[]{ + "--type", "bloom-filter", + "--values", "y-byte-1234", "--values", "y-byte-5566", + "--column", "y", + testFilePath.toString()}); + + CheckTool.main(conf, new String[]{ + "--type", "bloom-filter", + "--values", "z-byte-1234", "--values", "z-byte-5566", 
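+        // Column z has no bloom filter (bloomFilterColumns("x,y") above), so
+        // this call exercises the missing-index message; no z assertions follow.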
+ "--column", "z", + testFilePath.toString()}); + + System.out.flush(); + System.setOut(origOut); + + String output = myOut.toString(StandardCharsets.UTF_8); + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 0, bloom filter: maybe exist")); + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: 5566, bloom filter: not exist")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 0, bloom filter: maybe exist")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: 5566, bloom filter: maybe exist")); + + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-1234, bloom filter: maybe exist")); + assertTrue(output.contains("stripe: 0, rowIndex: 0, value: y-byte-5566, bloom filter: not exist")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-1234, bloom filter: not exist")); + assertTrue(output.contains("stripe: 0, rowIndex: 1, value: y-byte-5566, bloom filter: maybe exist")); + } +} diff --git a/site/_docs/java-tools.md b/site/_docs/java-tools.md index 92a876a5f4..f537201133 100644 --- a/site/_docs/java-tools.md +++ b/site/_docs/java-tools.md @@ -11,6 +11,7 @@ supports both the local file system and HDFS. The subcommands for the tools are: + * check (since ORC 2.0.1) - check the index of the specified column * convert (since ORC 1.4) - convert CSV/JSON/ORC files to ORC * count (since ORC 1.6) - recursively find *.orc and print the number of rows * data - print the data of an ORC file @@ -27,6 +28,24 @@ The command line looks like: ~~~ shell % java -jar orc-tools-X.Y.Z-uber.jar ~~~ +## Java Check + +The check command can check whether the specified value of the column specified by multiple ORC files can be filtered. + +Check statistics and bloom filter index on x column. +~~~ shell +% java -jar orc-tools-X.Y.Z-uber.jar check --type predicate /path/to/example.orc --values 1234 --values 5566 --column x +~~~ + +Check statistics on x column. +~~~ shell +% java -jar orc-tools-X.Y.Z-uber.jar check --type stat /path/to/example.orc --values 1234 --values 5566 --column x +~~~ + +Check bloom filter index on x column. +~~~ shell +% java -jar orc-tools-X.Y.Z-uber.jar check --type bloom-filter /path/to/example.orc --values 1234 --values 5566 --column x +~~~ ## Java Convert