Postgres - 创建表非常慢 - 带有分区和 BRIN 索引

dan*_*ang 7 sql postgresql query-performance

我有一张包含超过 2.6 亿条记录的表。我已经为表创建了分区并对其进行了索引。

CREATE TABLE qt_al_90k
(
    rec_id integer,
    user_id integer,
    user_text text,
    user_number double precision,
    user_date date,
    user_seq integer,
    my_sequence integer
) PARTITION BY RANGE (rec_id);
Run Code Online (Sandbox Code Playgroud)

分区查询:

CREATE TABLE qt_al_90k_rec_id_1 PARTITION OF qt_al_90k FOR VALUES FROM (0) TO (100000);
CREATE TABLE qt_al_90k_rec_id_2 PARTITION OF qt_al_90k FOR VALUES FROM (100000) TO (200000);
CREATE TABLE qt_al_90k_rec_id_3 PARTITION OF qt_al_90k FOR VALUES FROM (200000) TO (300000);
CREATE TABLE qt_al_90k_rec_id_4 PARTITION OF qt_al_90k FOR VALUES FROM (300000) TO (400000);
CREATE TABLE qt_al_90k_rec_id_5 PARTITION OF qt_al_90k FOR VALUES FROM (400000) TO (500000);
CREATE TABLE qt_al_90k_rec_id_6 PARTITION OF qt_al_90k FOR VALUES FROM (500000) TO (600000);
CREATE TABLE qt_al_90k_rec_id_7 PARTITION OF qt_al_90k FOR VALUES FROM (600000) TO (700000);
CREATE TABLE qt_al_90k_rec_id_8 PARTITION OF qt_al_90k FOR VALUES FROM (700000) TO (800000);
CREATE TABLE qt_al_90k_rec_id_9 PARTITION OF qt_al_90k FOR VALUES FROM (800000) TO (900000);
CREATE TABLE qt_al_90k_rec_id_10 PARTITION OF qt_al_90k FOR VALUES FROM (900000) TO (1000000);
CREATE TABLE qt_al_90k_rec_id_11 PARTITION OF qt_al_90k FOR VALUES FROM (1000000) TO (1100000);
CREATE TABLE qt_al_90k_rec_id_12 PARTITION OF qt_al_90k FOR VALUES FROM (1100000) TO (1200000);
CREATE TABLE qt_al_90k_rec_id_13 PARTITION OF qt_al_90k FOR VALUES FROM (1200000) TO (1300000);
CREATE TABLE qt_al_90k_rec_id_14 PARTITION OF qt_al_90k FOR VALUES FROM (1300000) TO (1400000);
CREATE TABLE qt_al_90k_rec_id_15 PARTITION OF qt_al_90k FOR VALUES FROM (1400000) TO (1500000);
CREATE TABLE qt_al_90k_rec_id_16 PARTITION OF qt_al_90k FOR VALUES FROM (1500000) TO (1600000);
CREATE TABLE qt_al_90k_rec_id_17 PARTITION OF qt_al_90k FOR VALUES FROM (1600000) TO (1700000);
CREATE TABLE qt_al_90k_rec_id_18 PARTITION OF qt_al_90k FOR VALUES FROM (1700000) TO (1800000);
CREATE TABLE qt_al_90k_rec_id_19 PARTITION OF qt_al_90k FOR VALUES FROM (1800000) TO (1900000);
CREATE TABLE qt_al_90k_rec_id_20 PARTITION OF qt_al_90k FOR VALUES FROM (1900000) TO (2000000);
CREATE TABLE qt_al_90k_rec_id_21 PARTITION OF qt_al_90k FOR VALUES FROM (2000000) TO (2100000);
CREATE TABLE qt_al_90k_rec_id_22 PARTITION OF qt_al_90k FOR VALUES FROM (2100000) TO (2200000);
CREATE TABLE qt_al_90k_rec_id_23 PARTITION OF qt_al_90k FOR VALUES FROM (2200000) TO (2300000);
CREATE TABLE qt_al_90k_rec_id_24 PARTITION OF qt_al_90k FOR VALUES FROM (2300000) TO (2400000);
CREATE TABLE qt_al_90k_rec_id_25 PARTITION OF qt_al_90k FOR VALUES FROM (2400000) TO (2500000);
CREATE TABLE qt_al_90k_rec_id_26 PARTITION OF qt_al_90k FOR VALUES FROM (2500000) TO (2600000);
CREATE TABLE qt_al_90k_rec_id_27 PARTITION OF qt_al_90k FOR VALUES FROM (2600000) TO (3000000);
Run Code Online (Sandbox Code Playgroud)

索引查询:

CREATE INDEX qt_al_90k_user_id_1 ON qt_al_90k_rec_id_1 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_1 ON qt_al_90k_rec_id_1 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_2 ON qt_al_90k_rec_id_2 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_2 ON qt_al_90k_rec_id_2 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_3 ON qt_al_90k_rec_id_3 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_3 ON qt_al_90k_rec_id_3 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_4 ON qt_al_90k_rec_id_4 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_4 ON qt_al_90k_rec_id_4 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_5 ON qt_al_90k_rec_id_5 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_5 ON qt_al_90k_rec_id_5 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_6 ON qt_al_90k_rec_id_6 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_6 ON qt_al_90k_rec_id_6 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_7 ON qt_al_90k_rec_id_7 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_7 ON qt_al_90k_rec_id_7 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_8 ON qt_al_90k_rec_id_8 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_8 ON qt_al_90k_rec_id_8 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_9 ON qt_al_90k_rec_id_9 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_9 ON qt_al_90k_rec_id_9 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_10 ON qt_al_90k_rec_id_10 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_10 ON qt_al_90k_rec_id_10 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_11 ON qt_al_90k_rec_id_11 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_11 ON qt_al_90k_rec_id_11 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_12 ON qt_al_90k_rec_id_12 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_12 ON qt_al_90k_rec_id_12 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_13 ON qt_al_90k_rec_id_13 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_13 ON qt_al_90k_rec_id_13 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_14 ON qt_al_90k_rec_id_14 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_14 ON qt_al_90k_rec_id_14 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_15 ON qt_al_90k_rec_id_15 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_15 ON qt_al_90k_rec_id_15 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_16 ON qt_al_90k_rec_id_16 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_16 ON qt_al_90k_rec_id_16 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_17 ON qt_al_90k_rec_id_17 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_17 ON qt_al_90k_rec_id_17 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_18 ON qt_al_90k_rec_id_18 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_18 ON qt_al_90k_rec_id_18 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_19 ON qt_al_90k_rec_id_19 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_19 ON qt_al_90k_rec_id_19 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_20 ON qt_al_90k_rec_id_20 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_20 ON qt_al_90k_rec_id_20 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_21 ON qt_al_90k_rec_id_21 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_21 ON qt_al_90k_rec_id_21 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_22 ON qt_al_90k_rec_id_22 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_22 ON qt_al_90k_rec_id_22 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_23 ON qt_al_90k_rec_id_23 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_23 ON qt_al_90k_rec_id_23 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_24 ON qt_al_90k_rec_id_24 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_24 ON qt_al_90k_rec_id_24 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_25 ON qt_al_90k_rec_id_25 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_25 ON qt_al_90k_rec_id_25 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_26 ON qt_al_90k_rec_id_26 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_26 ON qt_al_90k_rec_id_26 USING brin(rec_id);
CREATE INDEX qt_al_90k_user_id_27 ON qt_al_90k_rec_id_27 USING brin(user_id);
CREATE INDEX qt_al_90k_rec_27 ON qt_al_90k_rec_id_27 USING brin(rec_id);
Run Code Online (Sandbox Code Playgroud)

这是我的检索查询:

create table test123 as
select user_id,user_text, rec_id, user_seq 
 from qt_al_90k 
 where rec_id in (
  1492,1493,1494,1495,1496,1497,1498,1499,1500,1501) and 
 user_id in (
  37
 );
Run Code Online (Sandbox Code Playgroud)

此查询应返回大约 50 万条记录。但是,这需要超过 11 分钟来创建。当我超过 100 时rec_id,查询永远不会返回。

我有 1000 多个 rec_id 需要处理,但是,我做不到。

如何提高 create table 语句的性能?请帮忙。

编辑

这里是 EXPLAIN (ANALYZE, BUFFERS)

"Gather  (cost=1607.98..2817973.44 rows=257182 width=29) (actual time=119.210..344297.379 rows=500000 loops=1)"
"  Workers Planned: 2"
"  Workers Launched: 2"
"  Buffers: shared hit=49963 read=43384"
"  I/O Timings: read=509470.717"
"  ->  Parallel Append  (cost=607.98..2791255.24 rows=107159 width=29) (actual time=116.662..343151.672 rows=166667 loops=3)"
"        Buffers: shared hit=49963 read=43384"
"        I/O Timings: read=509470.717"
"        ->  Parallel Bitmap Heap Scan on qt_al_90k_rec_id_1  (cost=607.98..2790719.45 rows=107159 width=29) (actual time=116.661..343137.730 rows=166667 loops=3)"
"              Recheck Cond: ((rec_id = ANY ('{1492,1493,1494,1495,1496,1497,1498,1499,1500,1501}'::integer[])) AND (va_id = 37))"
"              Rows Removed by Index Recheck: 3862086"
"              Heap Blocks: lossy=30808"
"              Buffers: shared hit=49963 read=43384"
"              I/O Timings: read=509470.717"
"              ->  BitmapAnd  (cost=607.98..607.98 rows=4480328 width=0) (actual time=92.004..92.004 rows=0 loops=1)"
"                    Buffers: shared hit=395 read=24"
"                    I/O Timings: read=0.082"
"                    ->  Bitmap Index Scan on qt_al_90k_rec_1  (cost=0.00..116.02 rows=5232168 width=0) (actual time=47.230..47.230 rows=8016640 loops=1)"
"                          Index Cond: (rec_id = ANY ('{1492,1493,1494,1495,1496,1497,1498,1499,1500,1501}'::integer[]))"
"                          Buffers: shared hit=381 read=1"
"                          I/O Timings: read=0.011"
"                    ->  Bitmap Index Scan on qt_al_90k_user_id_1  (cost=0.00..363.11 rows=134190580 width=0) (actual time=44.130..44.130 rows=11761920 loops=1)"
"                          Index Cond: (va_id = 37)"
"                          Buffers: shared hit=14 read=23"
"                          I/O Timings: read=0.071"
"Planning Time: 1.814 ms"
"Execution Time: 344320.891 ms"
Run Code Online (Sandbox Code Playgroud)

jja*_*nes 2

行的物理顺序是否与 user_id(或 va_id,无论列的真实名称是什么)和 rec_id 列的值密切相关?如果没有,那么您拥有的 BRIN 索引几乎毫无用处。您可以使用 BTREE 索引重试。或者也许更好,在(va_id, rec_id). 出于测试目的,您可以仅在分区 qt_al_90k_rec_id_1 上构建它。

您的数据缓存不佳。但是,这是因为它无法很好地缓存(您没有足够的内存)还是因为它恰好不在您运行查询的那一刻?由于大部分时间都是在读取表数据,因此更有效地使用索引可能没有多大帮助,因为它仍然需要读取表数据(除非您也可以通过添加其他正在选择的列来获得仅索引扫描)

你们的 IO 系统是什么样的?如果您有 RAID 或 JBOD,那么增加 effective_io_concurrency 可能会有帮助。

我有超过 1000 个 rec_id 需要处理

它们分别属于多少个不同的分区?

很高兴看到EXPLAIN这一点。因为它永远不会结束,所以你不能这样做EXPLAIN (ANALYZE)

另外,您需要多久执行一次此操作?这看起来不像您经常运行的查询类型。仅仅为了它而建立索引可能是没有必要的,但更好的索引可能对其他查询也有用。