Summary of Problems Building a ClickHouse Cluster



1. Problem Description

While building a self-hosted ClickHouse cluster, I ran into several problems:

  • Table creation (DDL) not propagating across the cluster
  • Inserted data not replicating
  • Inconsistent query results

All of them turned out to be configuration issues. The data-inconsistency problem in particular took a long time to track down, so this post summarizes what I learned.

2. Cluster Design

Cluster layout: 5 nodes, 2 ClickHouse instances per node, for a total of 5 shards with 2 replicas each.

The layout as a table:

Node        Instance 1                              Instance 2
livelink1   livelink1:9000 (shard 01, replica 01)   livelink1:9002 (shard 02, replica 02)
livelink2   livelink2:9000 (shard 02, replica 01)   livelink2:9002 (shard 03, replica 02)
livelink3   livelink3:9000 (shard 03, replica 01)   livelink3:9002 (shard 04, replica 02)
livelink4   livelink4:9000 (shard 04, replica 01)   livelink4:9002 (shard 05, replica 02)
livelink5   livelink5:9000 (shard 05, replica 01)   livelink5:9002 (shard 01, replica 02)

The architecture forms a ring: each node hosts replica 01 of its own shard and replica 02 of the preceding shard, so adjacent nodes back each other up.

[Architecture diagram]

3. Instance Configuration

3.1 config-node1.xml

The example below is config-node1.xml for instance 1 on node livelink1. The parts that must be modified per node/instance, or that deserve special attention (highlighted in red in the original post), are flagged with comments.

<?xml version="1.0"?>
<yandex>
    <logger>
        <level>trace</level>
        <log>/var/log/clickhouse-server/clickhouse-server-node1.log</log>
        <errorlog>/var/log/clickhouse-server/clickhouse-server-node1.err.log</errorlog>
        <size>1000M</size>
        <count>10</count>
    </logger>

    <!-- Adjust per node and instance -->
    <http_port>8081</http_port>
    <listen_host>::</listen_host>
    <tcp_port>9000</tcp_port>
    <mysql_port>9004</mysql_port>
    <postgresql_port>9005</postgresql_port>
    <interserver_http_port>9009</interserver_http_port>
    <interserver_http_host>livelink1</interserver_http_host>
    <path>/data/clickhouse/node1/</path>
    <tmp_path>/data/clickhouse/node1/tmp/</tmp_path>
    <format_schema_path>/data/clickhouse/node1/format_schemas/</format_schema_path>
    <user_files_path>/data/clickhouse/node1/user_files/</user_files_path>
    <user_directories>
        <users_xml>
            <path>users-node1.xml</path>
        </users_xml>
        <local_directory>
            <path>/data/clickhouse/node1/access/</path>
        </local_directory>
    </user_directories>

    <!-- The names clickhouse_remote_servers, zookeeper-servers and macros must match metrika-node1.xml -->
    <include_from>/etc/clickhouse-server/metrika-node1.xml</include_from>
    <remote_servers incl="clickhouse_remote_servers" />
    <zookeeper incl="zookeeper-servers" optional="true" />
    <macros incl="macros" optional="true" />

    <max_connections>4096</max_connections>
    <keep_alive_timeout>3</keep_alive_timeout>
    <max_concurrent_queries>100</max_concurrent_queries>
    <max_server_memory_usage>0</max_server_memory_usage>
    <max_thread_pool_size>10000</max_thread_pool_size>
    <max_server_memory_usage_to_ram_ratio>0.9</max_server_memory_usage_to_ram_ratio>
    <total_memory_profiler_step>4194304</total_memory_profiler_step>
    <total_memory_tracker_sample_probability>0</total_memory_tracker_sample_probability>
    <uncompressed_cache_size>8589934592</uncompressed_cache_size>
    <mark_cache_size>5368709120</mark_cache_size>
    <mmap_cache_size>1000</mmap_cache_size>
    <compiled_expression_cache_size>1073741824</compiled_expression_cache_size>
    <default_profile>default</default_profile>
    <custom_settings_prefixes></custom_settings_prefixes>
    <default_database>default</default_database>
    <mlock_executable>true</mlock_executable>
    <!-- Reallocate memory for machine code ("text") using huge pages. Highly experimental. -->
    <remap_executable>false</remap_executable>
    <builtin_dictionaries_reload_interval>3600</builtin_dictionaries_reload_interval>
    <max_session_timeout>3600</max_session_timeout>
    <default_session_timeout>60</default_session_timeout>

    <query_log>
        <database>system</database>
        <table>query_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_log>
    <trace_log>
        <database>system</database>
        <table>trace_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </trace_log>
    <query_thread_log>
        <database>system</database>
        <table>query_thread_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_thread_log>
    <metric_log>
        <database>system</database>
        <table>metric_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
        <collect_interval_milliseconds>1000</collect_interval_milliseconds>
    </metric_log>
    <asynchronous_metric_log>
        <database>system</database>
        <table>asynchronous_metric_log</table>
        <flush_interval_milliseconds>60000</flush_interval_milliseconds>
    </asynchronous_metric_log>

    <!-- OpenTelemetry log contains OpenTelemetry trace spans. -->
    <opentelemetry_span_log>
        <engine>
            engine MergeTree
            partition by toYYYYMM(finish_date)
            order by (finish_date, finish_time_us, trace_id)
        </engine>
        <database>system</database>
        <table>opentelemetry_span_log</table>
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </opentelemetry_span_log>

    <crash_log>
        <database>system</database>
        <table>crash_log</table>
        <partition_by />
        <flush_interval_milliseconds>1000</flush_interval_milliseconds>
    </crash_log>

    <top_level_domains_lists></top_level_domains_lists>
    <dictionaries_config>*_dictionary.xml</dictionaries_config>
    <distributed_ddl>
        <path>/clickhouse/task_queue/ddl</path>
    </distributed_ddl>

    <graphite_rollup_example>
        <pattern>
            <regexp>click_cost</regexp>
            <function>any</function>
            <retention>
                <age>0</age>
                <precision>3600</precision>
            </retention>
            <retention>
                <age>86400</age>
                <precision>60</precision>
            </retention>
        </pattern>
        <default>
            <function>max</function>
            <retention>
                <age>0</age>
                <precision>60</precision>
            </retention>
            <retention>
                <age>3600</age>
                <precision>300</precision>
            </retention>
            <retention>
                <age>86400</age>
                <precision>3600</precision>
            </retention>
        </default>
    </graphite_rollup_example>

    <query_masking_rules>
        <rule>
            <name>hide encrypt/decrypt arguments</name>
            <regexp>((?:aes_)?(?:encrypt|decrypt)(?:_mysql)?)\s*\(\s*(?:'(?:\\'|.)+'|.*?)\s*\)</regexp>
            <replace>\1(???)</replace>
        </rule>
    </query_masking_rules>

    <send_crash_reports>
        <enabled>false</enabled>
        <anonymize>false</anonymize>
        <endpoint>https://6f33034cfe684dd7a3ab9875e57b1c8d@o388870.ingest.sentry.io/5226277</endpoint>
    </send_crash_reports>

    <http_server_default_response><![CDATA[<html ng-app="SMI2"><head><base href="http://ui.tabix.io/"></head><body><div ui-view="" class="content-ui"></div><script src="http://loader.tabix.io/master.js"></script></body></html>]]></http_server_default_response>
</yandex>
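For the second instance on the same node (assumed here to be config-node2.xml, following the node1 naming), the same file is reused with per-instance values changed. Only tcp_port 9002 is fixed by the cluster table above; the other port values below are illustrative assumptions that merely must not collide with instance 1:

<!-- Sketch of the per-instance deltas for instance 2 on livelink1 (assumed file config-node2.xml).
     tcp_port 9002 comes from the cluster layout; all other port values are assumptions. -->
<http_port>8082</http_port>                          <!-- assumed: any free port -->
<tcp_port>9002</tcp_port>
<mysql_port>9104</mysql_port>                        <!-- assumed -->
<postgresql_port>9105</postgresql_port>              <!-- assumed -->
<interserver_http_port>9010</interserver_http_port>  <!-- assumed -->
<path>/data/clickhouse/node2/</path>
<tmp_path>/data/clickhouse/node2/tmp/</tmp_path>
<include_from>/etc/clickhouse-server/metrika-node2.xml</include_from>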

3.2 users-node1.xml

<?xml version="1.0"?>
<yandex>
    <profiles>
        <default>
            <max_memory_usage>10000000000</max_memory_usage>
            <load_balancing>random</load_balancing>
        </default>
        <readonly>
            <readonly>1</readonly>
        </readonly>
    </profiles>

    <users>
        <!-- Two accounts: one read-write, one read-only -->
        <livelink_rw>
            <password>xxx</password>
            <networks>
                <ip>::/0</ip>
            </networks>
            <profile>default</profile>
            <quota>default</quota>
        </livelink_rw>
        <livelink_r>
            <password>xxx</password>
            <networks>
                <ip>::/0</ip>
            </networks>
            <profile>readonly</profile>
            <quota>default</quota>
        </livelink_r>
    </users>

    <!-- Quotas. -->
    <quotas>
        <!-- Name of quota. -->
        <default>
            <!-- Limits for time interval. You could specify many intervals with different limits. -->
            <interval>
                <!-- Length of interval. -->
                <duration>3600</duration>
                <!-- No limits. Just calculate resource usage for time interval. -->
                <queries>0</queries>
                <errors>0</errors>
                <result_rows>0</result_rows>
                <read_rows>0</read_rows>
                <execution_time>0</execution_time>
            </interval>
        </default>
    </quotas>
</yandex>
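To sanity-check the two accounts, connect as livelink_r and attempt a write: with readonly set to 1 in its profile, ClickHouse rejects data modification. A minimal sketch, using the test table created later in section 4 (the exact error wording varies by ClickHouse version):

-- Reads succeed for livelink_r:
SELECT 1;
-- A write should be rejected with a readonly-mode exception:
INSERT INTO test.test_local VALUES (1, '2021-07-01', 1, 1, 1, 11, now());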

3.3 metrika-node1.xml

<?xml version="1.0"?>
<yandex>
    <!-- Cluster/server definitions -->
    <clickhouse_remote_servers>
        <livelink_cluster>
            <shard>
                <weight>1</weight>
                <internal_replication>true</internal_replication>
                <replica>
                    <host>livelink1</host>
                    <port>9000</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
                <replica>
                    <host>livelink5</host>
                    <port>9002</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
            </shard>
            <shard>
                <weight>1</weight>
                <internal_replication>true</internal_replication>
                <replica>
                    <host>livelink1</host>
                    <port>9002</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
                <replica>
                    <host>livelink2</host>
                    <port>9000</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
            </shard>
            <shard>
                <weight>1</weight>
                <internal_replication>true</internal_replication>
                <replica>
                    <host>livelink2</host>
                    <port>9002</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
                <replica>
                    <host>livelink3</host>
                    <port>9000</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
            </shard>
            <shard>
                <weight>1</weight>
                <internal_replication>true</internal_replication>
                <replica>
                    <host>livelink3</host>
                    <port>9002</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
                <replica>
                    <host>livelink4</host>
                    <port>9000</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
            </shard>
            <shard>
                <weight>1</weight>
                <internal_replication>true</internal_replication>
                <replica>
                    <host>livelink4</host>
                    <port>9002</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
                <replica>
                    <host>livelink5</host>
                    <port>9000</port>
                    <user>livelink_rw</user>
                    <password>livelink123</password>
                </replica>
            </shard>
        </livelink_cluster>
    </clickhouse_remote_servers>

    <!-- ZooKeeper configuration -->
    <zookeeper-servers>
        <node index="1">
            <host>livelink1</host>
            <port>2181</port>
        </node>
        <node index="2">
            <host>livelink2</host>
            <port>2181</port>
        </node>
        <node index="3">
            <host>livelink3</host>
            <port>2181</port>
        </node>
        <node index="4">
            <host>livelink4</host>
            <port>2181</port>
        </node>
        <node index="5">
            <host>livelink5</host>
            <port>2181</port>
        </node>
    </zookeeper-servers>

    <!-- Per-instance macros (this file is livelink1:9000 = shard 01, replica 01) -->
    <macros>
        <layer>01</layer>
        <shard>01</shard>
        <replica>01-01-01</replica>
    </macros>

    <networks>
        <ip>::/0</ip>
    </networks>

    <clickhouse_compression>
        <case>
            <min_part_size>10000000000</min_part_size>
            <min_part_size_ratio>0.01</min_part_size_ratio>
            <method>lz4</method>
        </case>
    </clickhouse_compression>
</yandex>

The clickhouse_remote_servers, zookeeper-servers and macros sections defined in metrika-node1.xml must be referenced in config-node1.xml under exactly the same names. This is important! Otherwise the defined macros are unusable, and tables or data will not be synchronized.


<include_from>/etc/clickhouse-server/metrika-node1.xml</include_from>
<remote_servers incl="clickhouse_remote_servers" />
<zookeeper incl="zookeeper-servers" optional="true" />
<macros incl="macros" optional="true" />
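After restarting an instance, you can confirm the includes were actually picked up by querying system.clusters; each of the 5 shards should list 2 replicas:

SELECT cluster, shard_num, replica_num, host_name, port
FROM system.clusters
WHERE cluster = 'livelink_cluster';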

In each instance's metrika file, macros must be configured individually. At first I assumed macros were just a naming convenience used when pointing a table at ZooKeeper during CREATE TABLE. In fact, getting macros exactly right is critical; otherwise distributed-table queries return inconsistent results!

The naming scheme can be anything you like, but it must be applied consistently across all instances. The scheme above is layer-shard-replica. The macros for each instance follow (a query to verify them comes after the list):

  • Instance livelink1:9000

<macros>
    <layer>01</layer>
    <shard>01</shard>
    <replica>01-01-01</replica>
</macros>

  • Instance livelink1:9002

<macros>
    <layer>01</layer>
    <shard>02</shard>
    <replica>01-02-02</replica>
</macros>

  • Instance livelink2:9000

<macros>
    <layer>01</layer>
    <shard>02</shard>
    <replica>01-02-01</replica>
</macros>

  • Instance livelink2:9002

<macros>
    <layer>01</layer>
    <shard>03</shard>
    <replica>01-03-02</replica>
</macros>

  • Instance livelink3:9000

<macros>
    <layer>01</layer>
    <shard>03</shard>
    <replica>01-03-01</replica>
</macros>

  • Instance livelink3:9002

<macros>
    <layer>01</layer>
    <shard>04</shard>
    <replica>01-04-02</replica>
</macros>

  • Instance livelink4:9000

<macros>
    <layer>01</layer>
    <shard>04</shard>
    <replica>01-04-01</replica>
</macros>

  • Instance livelink4:9002

<macros>
    <layer>01</layer>
    <shard>05</shard>
    <replica>01-05-02</replica>
</macros>

  • Instance livelink5:9000

<macros>
    <layer>01</layer>
    <shard>05</shard>
    <replica>01-05-01</replica>
</macros>

  • Instance livelink5:9002

<macros>
    <layer>01</layer>
    <shard>01</shard>
    <replica>01-01-02</replica>
</macros>
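To verify the effective values, query system.macros on every instance and compare against the list above (a quick sketch):

-- e.g. on livelink3:9000 this must return shard = 03 and replica = 01-03-01
SELECT macro, substitution FROM system.macros;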

4. Verifying the Data

4.1 Create the replicated table

The statement only needs to be run on a single instance; with ON CLUSTER, the local table is created on every instance.

When creating the table, column types are case-sensitive: Int32 cannot be written as int32, and String cannot be written as string or STRING, otherwise the statement fails.

CREATE TABLE test.test_local ON CLUSTER livelink_cluster (
    CounterID UInt32,
    StartDate Date,
    Sign Int8,
    IsNew UInt8,
    VisitID UInt64,
    UserID UInt64,
    StartTime DateTime
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{shard}/test.test_local', '{replica}')
PARTITION BY toYYYYMM(StartDate)
ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)
SAMPLE BY intHash32(UserID)
SETTINGS index_granularity = 8192
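As an illustration, on livelink1:9000 (shard = 01, replica = 01-01-01) the engine arguments expand to '/clickhouse/tables/01/test.test_local' and '01-01-01'. Each instance can confirm what its table actually registered in ZooKeeper (a sketch):

-- zookeeper_path and replica_name show exactly what {shard} and {replica} expanded to
SELECT zookeeper_path, replica_name
FROM system.replicas
WHERE database = 'test' AND table = 'test_local';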

4.2 Create the distributed table

CREATE TABLE test.test_all ON CLUSTER livelink_cluster AS test.test_local
ENGINE = Distributed(livelink_cluster, test, test_local, rand())
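The rand() sharding key spreads writes uniformly across the 5 shards. Besides writing to a local table as in 4.3 below, you can also insert through the distributed table and let it route the row to a shard (a sketch; the values are made up for illustration):

-- The Distributed engine picks a shard from rand() and forwards the insert
INSERT INTO test.test_all VALUES (2, '2021-07-02', 1, 1, 2, 12, toDateTime('2021-07-08 01:00:00'));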

4.3 Insert data

Insert a row into the local replicated table on the livelink2:9000 instance:

INSERT INTO test.test_local VALUES (1, '2021-07-01', 1, 1, 1, 11, toDateTime('2021-07-08 00:11:22'));
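The row lands on shard 02, whose replica peer is livelink1:9002; once replication catches up, the same row should be visible there (a sketch):

-- Run on livelink1:9002: the replicated row should appear in the local table
SELECT count() FROM test.test_local;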

4.4 Query data

Per the architecture above, the two replicas of shard 02 live on livelink2:9000 and livelink1:9002, so the row is visible in the local replicated table on both of those instances; the local tables on the other instances return nothing. Through the distributed table, however, every instance can see the row.

livelink1 :) select * from test.test_local;

SELECT *
FROM test.test_local

Query id: 1478d7ff-3885-45bc-b0bf-1c3111e51dcb

┌─CounterID─┬──StartDate─┬─Sign─┬─IsNew─┬─VisitID─┬─UserID─┬───────────StartTime─┐
│         1 │ 2021-07-01 │    1 │     1 │       1 │     11 │ 2021-07-08 00:11:22 │
└───────────┴────────────┴──────┴───────┴─────────┴────────┴─────────────────────┘

1 rows in set. Elapsed: 0.002 sec.

livelink1 :) select * from test.test_all;

SELECT *
FROM test.test_all

Query id: baf6b70d-2eaf-492a-b295-5ee9b6b226c3

┌─CounterID─┬──StartDate─┬─Sign─┬─IsNew─┬─VisitID─┬─UserID─┬───────────StartTime─┐
│         1 │ 2021-07-01 │    1 │     1 │       1 │     11 │ 2021-07-08 00:11:22 │
└───────────┴────────────┴──────┴───────┴─────────┴────────┴─────────────────────┘

1 rows in set. Elapsed: 0.005 sec.

When querying the distributed table, run the query several times and check that every run returns accurate, identical results. I hit this exact pitfall: of the macros settings on the 10 instances, the one on instance xxx:9002 was wrong, and after inserting a single row, repeated queries against the distributed table kept returning different results. Be careful!
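One way to spot a bad replica quickly is to count rows on every replica of every shard in a single query, assuming a ClickHouse version that provides the clusterAllReplicas table function (a sketch):

-- Both replicas of a shard should report the same count; a mismatch points at the
-- instance whose macros (and therefore ZooKeeper path) are wrong
SELECT hostName() AS host, tcpPort() AS port, count() AS rows
FROM clusterAllReplicas('livelink_cluster', test, test_local)
GROUP BY host, port
ORDER BY host, port;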

5. Summary

In short, a self-hosted ClickHouse cluster is likely to hit the following problems; in each case, check the configuration first:

  • Table creation not propagating: check the macros, clickhouse_remote_servers and zookeeper-servers configuration
  • Inserted data not replicating: check the macros and clickhouse_remote_servers configuration
  • Inconsistent query results: check the macros configuration (a cluster-wide health check is sketched below)
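As a final sanity check, the replication state of every replica can be pulled in one query (again a sketch that assumes clusterAllReplicas is available):

-- is_readonly = 1 or a growing absolute_delay on any replica suggests that
-- instance's configuration (often its macros) deserves a second look
SELECT hostName() AS host, database, table, replica_name, is_readonly, absolute_delay
FROM clusterAllReplicas('livelink_cluster', system, replicas)
WHERE database = 'test';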



Copyright notice: This is an original article by L13763338360, distributed under the CC 4.0 BY-SA license. Please include a link to the original article and this notice when reposting.