Skip to content

Commit

Permalink
Merge branch 'master' into keeper-disk-move-fix
Browse files Browse the repository at this point in the history
  • Loading branch information
antonio2368 committed Feb 27, 2024
2 parents 178c60a + c8db540 commit 89789f8
Show file tree
Hide file tree
Showing 249 changed files with 7,716 additions and 2,111 deletions.
1 change: 1 addition & 0 deletions base/base/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set (CMAKE_CXX_STANDARD 20)

set (SRCS
argsToConfig.cpp
cgroupsv2.cpp
coverage.cpp
demangle.cpp
getAvailableMemoryAmount.cpp
Expand Down
64 changes: 64 additions & 0 deletions base/base/cgroupsv2.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#include <base/cgroupsv2.h>

#include <base/defines.h>

#include <fstream>
#include <sstream>
#include <system_error>


/// Reports whether the host has cgroups v2 enabled by probing for the file
/// 'cgroup.controllers' at the root of the default cgroups mount.
/// Returns false on non-Linux builds, if the file does not exist, or if its existence
/// cannot be determined because of an OS error (e.g. permission denied).
bool cgroupsV2Enabled()
{
#if defined(OS_LINUX)
    /// This file exists iff the host has cgroups v2 enabled.
    const auto controllers_file = default_cgroups_mount / "cgroup.controllers";
    /// Use the non-throwing overload: the throwing one raises std::filesystem::filesystem_error
    /// on underlying OS errors, which a yes/no probe should not propagate to its callers.
    /// On error the overload returns false, i.e. "cgroups v2 not usable".
    std::error_code ec;
    return std::filesystem::exists(controllers_file, ec);
#else
    return false;
#endif
}

/// Reports whether the cgroups v2 memory controller is enabled in the top-level cgroup.
/// Expects cgroupsV2Enabled() to hold (asserted via chassert).
/// Returns false on non-Linux builds, if 'cgroup.subtree_control' cannot be opened,
/// or if its first line does not mention "memory".
bool cgroupsV2MemoryControllerEnabled()
{
#if defined(OS_LINUX)
    chassert(cgroupsV2Enabled());
    /// Per https://docs.kernel.org/admin-guide/cgroup-v2.html:
    /// - 'cgroup.controllers' lists the controllers that *can* be enabled
    /// - 'cgroup.subtree_control' lists the controllers that *are* enabled
    /// Caveat: nested groups may disable controllers. To keep things simple, only the top-level group is inspected.
    std::ifstream subtree_control(default_cgroups_mount / "cgroup.subtree_control");
    if (!subtree_control.is_open())
        return false;
    /// Only the first line of the file is examined.
    std::string enabled_controllers;
    std::getline(subtree_control, enabled_controllers);
    return enabled_controllers.find("memory") != std::string::npos;
#else
    return false;
#endif
}

/// Returns the name of the cgroup (v2) the current process belongs to, i.e. the first line
/// of /proc/self/cgroup with the leading "0::/" removed.
/// Expects cgroupsV2Enabled() to hold (asserted via chassert).
/// Returns an empty string on non-Linux builds, if /proc/self/cgroup cannot be opened,
/// or if its first line does not start with "0::/". Note that a first line of exactly
/// "0::/" also yields an empty string.
std::string cgroupV2OfProcess()
{
#if defined(OS_LINUX)
    chassert(cgroupsV2Enabled());
    /// Membership could also be derived from the per-cgroup 'cgroup.procs' files, which list
    /// all PIDs assigned to a cgroup. Asking procfs about ourselves is simpler:
    std::ifstream proc_self_cgroup("/proc/self/cgroup");
    if (!proc_self_cgroup.is_open())
        return "";
    /// With cgroups v2, there will be a *single* line with prefix "0::/"
    /// (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
    std::string first_line;
    std::getline(proc_self_cgroup, first_line);
    static const std::string unified_prefix = "0::/";
    if (first_line.starts_with(unified_prefix))
        return first_line.substr(unified_prefix.length());
    return "";
#else
    return "";
#endif
}
22 changes: 22 additions & 0 deletions base/base/cgroupsv2.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#pragma once

#include <filesystem>
#include <string>

#if defined(OS_LINUX)
/// Default location of the cgroups hierarchy. Only defined on Linux builds.
/// NOTE: it is presumably possible to mount the cgroups hierarchy somewhere else (e.g. in containers).
/// In the cases observed so far, /sys/fs/cgroup was still symlinked to the actual mount.
static inline const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";
#endif

/// Is cgroups v2 enabled on the system?
/// Checks for the file 'cgroup.controllers' directly under the default cgroups mount.
/// Always returns false on non-Linux builds.
bool cgroupsV2Enabled();

/// Is the memory controller of cgroups v2 enabled on the system?
/// Only the top-level 'cgroup.subtree_control' file is inspected.
/// Precondition: cgroupsV2Enabled() returns true.
/// Always returns false on non-Linux builds.
bool cgroupsV2MemoryControllerEnabled();

/// Which cgroup does the process belong to?
/// The result is the first line of /proc/self/cgroup without its "0::/" prefix.
/// Returns an empty string if the cgroup cannot be determined; a first line of exactly "0::/"
/// also yields an empty string.
/// Precondition: cgroupsV2Enabled() returns true.
std::string cgroupV2OfProcess();
48 changes: 5 additions & 43 deletions base/base/getMemoryAmount.cpp
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
#include <base/getMemoryAmount.h>

#include <base/cgroupsv2.h>
#include <base/getPageSize.h>

#include <fstream>
#include <sstream>
#include <stdexcept>

#include <unistd.h>
#include <sys/types.h>
#include <sys/param.h>
#if defined(BSD)
#include <sys/sysctl.h>
#endif


namespace
Expand All @@ -20,49 +17,14 @@ namespace
std::optional<uint64_t> getCgroupsV2MemoryLimit()
{
#if defined(OS_LINUX)
const std::filesystem::path default_cgroups_mount = "/sys/fs/cgroup";

/// This file exists iff the host has cgroups v2 enabled.
std::ifstream controllers_file(default_cgroups_mount / "cgroup.controllers");
if (!controllers_file.is_open())
return {};

/// Make sure that the memory controller is enabled.
/// - cgroup.controllers defines which controllers *can* be enabled.
/// - cgroup.subtree_control defines which controllers *are* enabled.
/// (see https://docs.kernel.org/admin-guide/cgroup-v2.html)
/// Caveat: nested groups may disable controllers. For simplicity, check only the top-level group.
/// ReadBufferFromFile subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
/// std::string subtree_control;
/// readString(subtree_control, subtree_control_file);
/// if (subtree_control.find("memory") == std::string::npos)
/// return {};
std::ifstream subtree_control_file(default_cgroups_mount / "cgroup.subtree_control");
std::stringstream subtree_control_buf;
subtree_control_buf << subtree_control_file.rdbuf();
std::string subtree_control = subtree_control_buf.str();
if (subtree_control.find("memory") == std::string::npos)
return {};

/// Identify the cgroup the process belongs to
/// All PIDs assigned to a cgroup are in /sys/fs/cgroups/{cgroup_name}/cgroup.procs
/// A simpler way to get the membership is:
std::ifstream cgroup_name_file("/proc/self/cgroup");
if (!cgroup_name_file.is_open())
if (!cgroupsV2Enabled())
return {};

std::stringstream cgroup_name_buf;
cgroup_name_buf << cgroup_name_file.rdbuf();
std::string cgroup_name = cgroup_name_buf.str();
if (!cgroup_name.empty() && cgroup_name.back() == '\n')
cgroup_name.pop_back(); /// remove trailing newline, if any
/// With cgroups v2, there will be a *single* line with prefix "0::/"
const std::string v2_prefix = "0::/";
if (!cgroup_name.starts_with(v2_prefix))
if (!cgroupsV2MemoryControllerEnabled())
return {};
cgroup_name = cgroup_name.substr(v2_prefix.length());

std::filesystem::path current_cgroup = cgroup_name.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup_name);
std::string cgroup = cgroupV2OfProcess();
auto current_cgroup = cgroup.empty() ? default_cgroups_mount : (default_cgroups_mount / cgroup);

/// Open the bottom-most nested memory limit setting file. If there is no such file at the current
/// level, try again at the parent level as memory settings are inherited.
Expand Down
2 changes: 1 addition & 1 deletion contrib/liburing
Submodule liburing updated 173 files
2 changes: 1 addition & 1 deletion docker/test/fuzzer/run-fuzzer.sh
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ function download

chmod +x clickhouse
# clickhouse may be compressed - run once to decompress
./clickhouse ||:
./clickhouse --query "SELECT 1" ||:
ln -s ./clickhouse ./clickhouse-server
ln -s ./clickhouse ./clickhouse-client
ln -s ./clickhouse ./clickhouse-local
Expand Down
8 changes: 7 additions & 1 deletion docs/en/engines/table-engines/mergetree-family/mergetree.md
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,11 @@ Tags:
- `load_balancing` - Policy for disk balancing, `round_robin` or `least_used`.
- `least_used_ttl_ms` - Configure timeout (in milliseconds) for the updating available space on all disks (`0` - update always, `-1` - never update, default is `60000`). Note, if the disk can be used by ClickHouse only and is not subject to a online filesystem resize/shrink you can use `-1`, in all other cases it is not recommended, since eventually it will lead to incorrect space distribution.
- `prefer_not_to_merge` — You should not use this setting. Disables merging of data parts on this volume (this is harmful and leads to performance degradation). When this setting is enabled (don't do it), merging data on this volume is not allowed (which is bad). This allows (but you don't need it) controlling (if you want to control something, you're making a mistake) how ClickHouse works with slow disks (but ClickHouse knows better, so please don't use this setting).
- `volume_priority` — Defines the priority (order) in which volumes are filled. Lower value means higher priority. The parameter values should be natural numbers and collectively cover the range from 1 to N (lowest priority given) without skipping any numbers.
* If _all_ volumes are tagged, they are prioritized in given order.
* If only _some_ volumes are tagged, those without the tag have the lowest priority, and they are prioritized in the order they are defined in config.
* If _no_ volumes are tagged, their priority is set according to the order in which they are declared in the configuration.
* Two volumes cannot have the same priority value.

Configuration examples:

Expand Down Expand Up @@ -919,7 +924,8 @@ In given example, the `hdd_in_order` policy implements the [round-robin](https:/
If there are different kinds of disks available in the system, `moving_from_ssd_to_hdd` policy can be used instead. The volume `hot` consists of an SSD disk (`fast_ssd`), and the maximum size of a part that can be stored on this volume is 1GB. All the parts with the size larger than 1GB will be stored directly on the `cold` volume, which contains an HDD disk `disk1`.
Also, once the disk `fast_ssd` gets filled by more than 80%, data will be transferred to the `disk1` by a background process.

The order of volume enumeration within a storage policy is important. Once a volume is overfilled, data are moved to the next one. The order of disk enumeration is important as well because data are stored on them in turns.
The order of volume enumeration within a storage policy is important in case at least one of the volumes listed has no explicit `volume_priority` parameter.
Once a volume is overfilled, data are moved to the next one. The order of disk enumeration is important as well because data are stored on them in turns.

When creating a table, one can apply one of the configured storage policies to it:

Expand Down
5 changes: 1 addition & 4 deletions docs/en/getting-started/install.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,7 @@ It is recommended to use official pre-compiled `deb` packages for Debian or Ubun
#### Setup the Debian repository
``` bash
sudo apt-get install -y apt-transport-https ca-certificates dirmngr
GNUPGHOME=$(mktemp -d)
sudo GNUPGHOME="$GNUPGHOME" gpg --no-default-keyring --keyring /usr/share/keyrings/clickhouse-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 8919F6BD2B48D754
sudo rm -rf "$GNUPGHOME"
sudo chmod +r /usr/share/keyrings/clickhouse-keyring.gpg
sudo gpg --no-default-keyring --keyring /usr/share/keyrings/clickhouse-keyring.gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 8919F6BD2B48D754
echo "deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb stable main" | sudo tee \
/etc/apt/sources.list.d/clickhouse.list
Expand Down
46 changes: 46 additions & 0 deletions docs/en/operations/server-configuration-parameters/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,20 @@ Type: Bool

Default: 0


## dns_cache_max_size

Internal DNS cache max size in bytes.

:::note
ClickHouse also has a reverse cache, so the actual memory usage could be twice as much.
:::

Type: UInt64

Default: 1024


## dns_cache_update_period

Internal DNS cache update period in seconds.
Expand Down Expand Up @@ -458,6 +472,38 @@ Type: Double

Default: 0.9

## cgroups_memory_usage_observer_wait_time

Interval in seconds during which the server's maximum allowed memory consumption is adjusted by the corresponding threshold in cgroups (see
settings `cgroup_memory_watcher_hard_limit_ratio` and `cgroup_memory_watcher_soft_limit_ratio`).

Type: UInt64

Default: 15

## cgroup_memory_watcher_hard_limit_ratio

Specifies the "hard" threshold with regards to the memory consumption of the server process according to cgroups after which the server's
maximum memory consumption is adjusted to the threshold value.

See settings `cgroups_memory_usage_observer_wait_time` and `cgroup_memory_watcher_soft_limit_ratio`

Type: Double

Default: 0.95

## cgroup_memory_watcher_soft_limit_ratio

Specifies the "soft" threshold with regards to the memory consumption of the server process according to cgroups after which arenas in
jemalloc are purged.


See settings `cgroups_memory_usage_observer_wait_time` and `cgroup_memory_watcher_hard_limit_ratio`

Type: Double

Default: 0.95

## max_table_size_to_drop

Restriction on deleting tables.
Expand Down
12 changes: 8 additions & 4 deletions docs/en/operations/settings/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,7 @@ By default: 1,000,000. It only works when reading from MergeTree engines.

## max_concurrent_queries_for_user {#max-concurrent-queries-for-user}

The maximum number of simultaneously processed queries related to MergeTree table per user.
The maximum number of simultaneously processed queries per user.

Possible values:

Expand Down Expand Up @@ -1776,7 +1776,7 @@ Default value: 0 (no restriction).
## insert_quorum {#insert_quorum}

:::note
`insert_quorum` does not apply when using the [`SharedMergeTree` table engine](/en/cloud/reference/shared-merge-tree) in ClickHouse Cloud as all inserts are quorum inserted.
This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information.
:::

Enables the quorum writes.
Expand Down Expand Up @@ -1819,7 +1819,7 @@ See also:
## insert_quorum_parallel {#insert_quorum_parallel}

:::note
`insert_quorum_parallel` does not apply when using the [`SharedMergeTree` table engine](/en/cloud/reference/shared-merge-tree) in ClickHouse Cloud as all inserts are quorum inserted.
This setting is not applicable to SharedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information.
:::

Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected.
Expand All @@ -1839,6 +1839,10 @@ See also:

## select_sequential_consistency {#select_sequential_consistency}

:::note
This setting differs in behavior between SharedMergeTree and ReplicatedMergeTree, see [SharedMergeTree consistency](/docs/en/cloud/reference/shared-merge-tree/#consistency) for more information about the behavior of `select_sequential_consistency` in SharedMergeTree.
:::

Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default).

Possible values:
Expand Down Expand Up @@ -2037,7 +2041,7 @@ Possible values:
- 0 — Disabled.
- 1 — Enabled.

Default value: 1.
Default value: 0.

By default, async inserts are inserted into replicated tables by the `INSERT` statement enabling [async_insert](#async-insert) are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)).
For the replicated tables, by default, only 10000 of the most recent inserts for each partition are deduplicated (see [replicated_deduplication_window_for_async_inserts](merge-tree-settings.md/#replicated-deduplication-window-async-inserts), [replicated_deduplication_window_seconds_for_async_inserts](merge-tree-settings.md/#replicated-deduplication-window-seconds-async-inserts)).
Expand Down
38 changes: 38 additions & 0 deletions docs/en/operations/system-tables/dns_cache.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
---
slug: /en/operations/system-tables/dns_cache
---
# dns_cache

Contains information about cached DNS records.

Columns:

- `hostname` ([String](../../sql-reference/data-types/string.md)) — cached hostname
- `ip_address` ([String](../../sql-reference/data-types/string.md)) — ip address for the hostname
- `ip_family` ([Enum](../../sql-reference/data-types/enum.md)) — family of the ip address, possible values:
- 'IPv4'
- 'IPv6'
- 'UNIX_LOCAL'
- `cached_at` ([DateTime](../../sql-reference/data-types/datetime.md)) — when the record was cached

**Example**

Query:

```sql
SELECT * FROM system.dns_cache;
```

Result:

| hostname | ip\_address | ip\_family | cached\_at |
| :--- | :--- | :--- | :--- |
| localhost | ::1 | IPv6 | 2024-02-11 17:04:40 |
| localhost | 127.0.0.1 | IPv4 | 2024-02-11 17:04:40 |

**See also**

- [disable_internal_dns_cache setting](../../operations/server-configuration-parameters/settings.md#disable_internal_dns_cache)
- [dns_cache_max_size setting](../../operations/server-configuration-parameters/settings.md#dns_cache_max_size)
- [dns_cache_update_period setting](../../operations/server-configuration-parameters/settings.md#dns_cache_update_period)
- [dns_max_consecutive_failures setting](../../operations/server-configuration-parameters/settings.md#dns_max_consecutive_failures)
32 changes: 32 additions & 0 deletions docs/en/operations/system-tables/settings_changes.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
---
slug: /en/operations/system-tables/settings_changes
---
# settings_changes

Contains information about setting changes in previous ClickHouse versions.

Columns:

- `version` ([String](../../sql-reference/data-types/string.md)) — The ClickHouse version in which settings were changed
- `changes` ([Array](../../sql-reference/data-types/array.md) of [Tuple](../../sql-reference/data-types/tuple.md)) — A description of the setting changes: (setting name, previous value, new value, reason for the change)

**Example**

``` sql
SELECT *
FROM system.settings_changes
WHERE version = '23.5'
FORMAT Vertical
```

``` text
Row 1:
──────
version: 23.5
changes: [('input_format_parquet_preserve_order','1','0','Allow Parquet reader to reorder rows for better parallelism.'),('parallelize_output_from_storages','0','1','Allow parallelism when executing queries that read from file/url/s3/etc. This may reorder rows.'),('use_with_fill_by_sorting_prefix','0','1','Columns preceding WITH FILL columns in ORDER BY clause form sorting prefix. Rows with different values in sorting prefix are filled independently'),('output_format_parquet_compliant_nested_types','0','1','Change an internal field name in output Parquet file schema.')]
```

**See also**

- [Settings](../../operations/settings/index.md#session-settings-intro)
- [system.settings](settings.md)
Loading

0 comments on commit 89789f8

Please sign in to comment.