Merge remote-tracking branch 'upstream/main' into replace
# Conflicts:
#	src/query/service/src/interpreters/interpreter_copy.rs
SkyFan2002 committed Jul 18, 2023
2 parents a5782f5 + 1628857 commit 21449ee
Showing 146 changed files with 4,280 additions and 663 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/bindings_python.yml
Original file line number Diff line number Diff line change
@@ -90,7 +90,8 @@ jobs:
rust-toolchain: ${{ steps.toolchain.outputs.RUST_TOOLCHAIN }}
working-directory: src/bendpy
target: ${{ matrix.target.name }}
manylinux: auto
# Using version 2_28 because of https://github.com/PyO3/maturin-action/issues/197
manylinux: 2_28
# Keep them in one line due to https://github.com/PyO3/maturin-action/issues/153
rustup-components: rust-std rustfmt
args: ${{ steps.opts.outputs.BUILD_ARGS }}
47 changes: 41 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -214,3 +214,4 @@ rpath = false
arrow-format = { git = "https://github.com/sundy-li/arrow-format", rev = "c8e11341" }
parquet2 = { git = "https://github.com/jorgecarleitao/parquet2", rev = "ed0e1ff" }
metrics = { git = "https://github.com/datafuse-extras/metrics.git", rev = "fc2ecd1" }
croaring = { git = "https://github.com/RoaringBitmap/croaring-rs", rev = "13a71b4" }
45 changes: 18 additions & 27 deletions docs/doc/13-sql-reference/30-table-engines/00-fuse.md
@@ -2,9 +2,9 @@
title: Fuse Engine
---

## Description
Databend utilizes the Fuse engine as its default engine, offering a data management system with a user-friendly, Git-like interface. Users can query data at any given moment and restore it to any desired point in time.

Fuse engine is the default engine for Databend, it provides a git-like interface for data management. User could query data at any point in time, and restore data to any point in time, there is a blog post about this feature: [Time Travel](https://databend.rs/blog/time-travel).
**Related topic**: [Find Peter Parker in Databend](https://databend.rs/blog/time-travel)

## Syntax

@@ -13,38 +13,29 @@ CREATE TABLE table_name (
column_name1 column_type1,
column_name2 column_type2,
...
) [ENGINE = Fuse] [CLUSTER BY(<expr> [, <expr>, ...] )] [options];
) [ENGINE = Fuse] [CLUSTER BY(<expr> [, <expr>, ...] )] [Options];
```

Read more about the created table statement in [ddl-create-table](../../14-sql-commands/00-ddl/20-table/10-ddl-create-table.md)
For more information about the CREATE TABLE command, see [CREATE TABLE](../../14-sql-commands/00-ddl/20-table/10-ddl-create-table.md).

### Default engine
### ENGINE

If engine is not specified, we will default to using `Engine = Fuse`.
If an engine is not explicitly specified, Databend defaults to the Fuse engine when creating tables, which is equivalent to `Engine = Fuse`.


### Cluster Key
### CLUSTER BY

The `CLUSTER BY` parameter specifies how data is sorted, using one or more expressions, and takes effect during compaction or recluster. A well-chosen `CLUSTER BY` parameter can significantly accelerate queries.
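As a sketch (the table and column names here are hypothetical, not from the docs), a Fuse table with a cluster key could be created like this:

```sql
-- Sort blocks by region and event_date; Databend applies this
-- ordering during compaction and recluster operations.
CREATE TABLE events (
    id INT,
    region VARCHAR,
    event_date DATE
) ENGINE = Fuse CLUSTER BY (region, event_date);
```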


### Options

Fuse engine support following common case-insensitive options:

- `compression = '<compression>'`, `compression` could be `lz4`, `zstd`, `snappy`, `none`. Compression method defaults to be `zstd` in object storage but `lz4` in fs storage.

- `storage_format = '<storage_format>'`, `storage_format` could be `parquet` and `native`. Storage format defaults to be `parquet` in object storage but `native` in fs storage.

- `snapshot_loc = '<snapshot_loc>'`, it's a location parameter in string which could easily share a table without data copy.

- `block_size_threshold = '<block_size_threshold>'`, specifies the maximum data size for a file.
- `block_per_segment = '<block_per_segment>'`, specifies the maximum number of files that can be stored in a segment.
- `row_per_block = '<row_per_block>'`, specifies the maximum number of rows that can be stored in a file.


## What's storage format

By default, the storage_format is set to Parquet, which means the data is stored in Parquet format in the storage. Parquet is an open format that is suitable for cloud-native object storage and has a high compression ratio.

The storage_format also supports the Native format, which is an experimental format that primarily optimizes the additional memory copy overhead introduced when writing data to the storage. The Native format is suitable for storage devices such as file systems.
The Fuse engine offers options (case-insensitive) that allow you to configure various settings such as bloom index columns, compression method, storage format, snapshot location, block size threshold, blocks per segment, and rows per block. To modify the options of an existing table, use [ALTER TABLE OPTION](../../14-sql-commands/00-ddl/20-table/90-alter-table-option.md).

| Option | Syntax | Description |
|---------------------- |----------------------------------------------------- |------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| bloom_index_columns | `bloom_index_columns = '<column> [, <column> ...]'` | Specifies the columns to be used for the bloom index. The data type of these columns can be Map, Number, String, Date, or Timestamp. If no columns are specified, the bloom index is created by default on all supported columns. `bloom_index_columns=''` disables bloom indexing. |
| compression | `compression = '<compression>'` | Specifies the compression method for the engine. Compression options include lz4, zstd, snappy, or none. The compression method defaults to zstd in object storage and lz4 in file system (fs) storage. |
| storage_format | `storage_format = '<storage_format>'` | Specifies how data is stored. By default, the storage_format is set to **Parquet**, which offers high compression and is ideal for cloud-native object storage. Additionally, the experimental **Native** format is supported, optimizing memory copy overhead for storage devices like file systems. |
| snapshot_loc | `snapshot_loc = '<snapshot_loc>'` | Specifies a location parameter in string format, making it easy to share a table without copying data. |
| block_size_threshold | `block_size_threshold = '<block_size_threshold>'` | Specifies the maximum data size for a file. |
| block_per_segment | `block_per_segment = '<block_per_segment>'` | Specifies the maximum number of files that can be stored in a segment. |
| row_per_block | `row_per_block = '<row_per_block>'` | Specifies the maximum number of rows that can be stored in a file. |
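For illustration (hypothetical table name and option values; this is a sketch of how the options above combine, not an excerpt from the docs):

```sql
-- Options follow the engine clause as key = 'value' pairs
-- and are case-insensitive.
CREATE TABLE t (
    id INT,
    msg VARCHAR
) ENGINE = Fuse
  bloom_index_columns = 'msg'
  compression = 'zstd'
  row_per_block = 100000;
```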
5 changes: 3 additions & 2 deletions docs/doc/13-sql-reference/41-access-control-privileges.md
@@ -81,13 +81,14 @@ Databend offers a range of privileges that allow you to exercise fine-grained control

## Database Privileges

Please note that you can use the [USE DATABASE](../14-sql-commands/00-ddl/10-database/ddl-use-database.md) command to specify a database once you have any of the following privileges on the database, or any privilege on a table in the database.

| Privilege | Description |
|:----------|:--------------------------------------------------------------------------------|
| Alter | Renames a database. |
| CREATE | Creates a database. |
| DROP | Drops or undrops a database. Restores the recent version of a dropped database. |
| SELECT | SHOW CREATE a database. USE a database. |

| SELECT | SHOW CREATE a database. |
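As a hedged illustration of how a database privilege enables `USE DATABASE` (the role and database names below are hypothetical):

```sql
-- Granting any database-level privilege, e.g. SELECT, is enough
-- for the grantee to switch to the database.
GRANT SELECT ON db1.* TO ROLE analyst;

-- A user holding that role can now run:
USE db1;
SHOW CREATE DATABASE db1;
```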

## Session Policy Privileges

51 changes: 51 additions & 0 deletions docs/doc/14-sql-commands/00-ddl/10-database/ddl-use-database.md
@@ -0,0 +1,51 @@
---
title: USE DATABASE
---

Selects a database for the current session. This statement allows you to specify and switch to a different database. Once you set the current database using this command, it remains the same until the end of the session unless you choose to change it.

## Syntax

```sql
USE <database_name>
```

## Examples

```sql
-- Create two databases
CREATE DATABASE database1;
CREATE DATABASE database2;

-- Select and use "database1" as the current database
USE database1;

-- Create a new table "table1" in "database1"
CREATE TABLE table1 (
id INT,
name VARCHAR(50)
);

-- Insert data into "table1"
INSERT INTO table1 (id, name) VALUES (1, 'John');
INSERT INTO table1 (id, name) VALUES (2, 'Alice');

-- Query all data from "table1"
SELECT * FROM table1;

-- Switch to "database2" as the current database
USE database2;

-- Create a new table "table2" in "database2"
CREATE TABLE table2 (
id INT,
city VARCHAR(50)
);

-- Insert data into "table2"
INSERT INTO table2 (id, city) VALUES (1, 'New York');
INSERT INTO table2 (id, city) VALUES (2, 'London');

-- Query all data from "table2"
SELECT * FROM table2;
```
33 changes: 18 additions & 15 deletions docs/doc/14-sql-commands/00-ddl/20-table/60-optimize-table.md
@@ -59,14 +59,17 @@ FROM
```

**Syntax**

```sql
OPTIMIZE TABLE [database.]table_name COMPACT SEGMENT [LIMIT <segment_count>]
```

Compacts the table data by merging small segments into larger ones.

- The option LIMIT sets the maximum number of segments to be compacted. In this case, Databend will select and compact the latest segments.

**Example**

```sql
-- Check whether segment compaction is needed
SELECT
@@ -140,20 +143,26 @@ We recommend performing segment compaction first, followed by block compaction.
OPTIMIZE TABLE [database.]table_name COMPACT [LIMIT <segment_count>]
```
Compacts the table data by merging small blocks and segments into larger ones.

- This command creates a new snapshot (along with compacted segments and blocks) of the most recent table data without affecting the existing storage files, so the storage space won't be released until you purge the historical data.

- Depending on the size of the given table, it may take quite a while to complete the execution.

- The option LIMIT sets the maximum number of segments to be compacted. In this case, Databend will select and compact the latest segments.

- Databend will automatically re-cluster a clustered table after the compacting process.

**Example**
```sql
OPTIMIZE TABLE my_database.my_table COMPACT LIMIT 50;
```

## Purging

Purging permanently removes historical data, including unused snapshots, segments, and blocks, from your storage.
It can save storage space but may affect the Time Travel feature. Consider purging when:
Purging permanently removes historical data, including unused snapshots, segments, and blocks, from your storage. Snapshots within the retention period, along with the segments and blocks they reference, are retained. Purging can save storage space but may affect the Time Travel feature. Consider purging when:

- The storage cost is a major concern, and you don't require historical data for Time Travel or other purposes.

- You've compacted your table and want to remove older, unused data.

:::note
Expand All @@ -163,25 +172,19 @@ Historical data within the default retention period of 12 hours will not be remo
**Syntax**

```sql
-- Purge historical data
OPTIMIZE TABLE [database.]table_name PURGE

-- Purge historical data generated before a snapshot or a timestamp was created
OPTIMIZE TABLE [database.]table_name PURGE BEFORE (SNAPSHOT => '<SNAPSHOT_ID>')
OPTIMIZE TABLE [database.]table_name PURGE BEFORE (TIMESTAMP => '<TIMESTAMP>'::TIMESTAMP)
OPTIMIZE TABLE <table_name> PURGE [BEFORE (SNAPSHOT => '<SNAPSHOT_ID>')
| (TIMESTAMP => '<TIMESTAMP>'::TIMESTAMP)] [LIMIT <snapshot_count>]
```

- `OPTIMIZE TABLE <table_name> PURGE`

Purges historical data from the table. Only the latest snapshot (including the segments and blocks referenced by this snapshot) will be kept.
- `[BEFORE (SNAPSHOT => '<SNAPSHOT_ID>')
| (TIMESTAMP => '<TIMESTAMP>'::TIMESTAMP)]`

- `OPTIMIZE TABLE <table_name> PURGE BEFORE (SNAPSHOT => '<SNAPSHOT_ID>')`
Purges the historical data that was generated before the specified snapshot or timestamp was created.

Purges the historical data that was generated before the specified snapshot was created. This erases related snapshots, segments, and blocks from storage.
- `[LIMIT <snapshot_count>]`

- `OPTIMIZE TABLE <table_name> PURGE BEFORE (TIMESTAMP => '<TIMESTAMP>'::TIMESTAMP)`
Sets the maximum number of snapshots to be purged. Databend will select and purge the oldest snapshots.

Purges the historical data that was generated before the specified timestamp was created. This erases related snapshots, segments, and blocks from storage.

**Example**
