Jac*_*las 5 postgresql clustering postgresql-9.3
CLUSTER大表上的命令可能需要很长时间,并且在运行时会阻止对表的读取和写入。
我不需要按索引顺序对表中的数据进行严格排序,我只希望通常一起查询的行更有可能位于同一个数据库块中,而不是均匀地分散在表中(这是它们的分布)由于日期插入表格的方式的性质,自然有)。
它可以产生很大的不同。在下面的示例中,唯一的区别是insert有一个额外的,order by mod(g,10)以便测试数据由 预先聚类host_id。当获取一个特定host_id.
是否有某种方法可以在没有排它锁和cluster命令的日志记录开销的情况下实现这种集群?
-- Work in a scratch schema so the experiment's objects live in their own namespace.
create schema stack;
set search_path to stack;

-- Every row carries a 400-character default payload in bar.
create table foo (
    host_id integer,
    bar     text default repeat('a', 400)
);

-- Unclustered load: consecutive values of g map to different host_id values
-- (mod(g, 10) cycles through 0..9).
insert into foo (host_id)
select mod(g, 10)
from generate_series(1, 500000) as g;

create index nu_foo on foo (host_id);

-- Time the retrieval of all rows belonging to a single host_id.
explain analyze
select count(bar)
from foo
where host_id = 1;
/*
Recorded output:
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------------
 Aggregate (cost=30188.66..30188.67 rows=1 width=404) (actual time=1129.858..1129.859 rows=1 loops=1)
   -> Bitmap Heap Scan on foo (cost=919.27..30066.46 rows=48883 width=404) (actual time=253.149..1110.013 rows=50000 loops=1)
        Recheck Cond: (host_id = 1)
        Rows Removed by Index Recheck: 320257
        -> Bitmap Index Scan on nu_foo (cost=0.00..907.04 rows=48883 width=0) (actual time=251.863..251.863 rows=50000 loops=1)
             Index Cond: (host_id = 1)
 Total runtime: 1129.893 ms
*/
--
-- Start over with an empty table for the second run.
drop table foo;

create table foo (
    host_id integer,
    bar     text default repeat('a', 400)
);

-- Same data as the first run, but the rows are fed to the insert sorted by
-- mod(g, 10), i.e. grouped by the host_id value they will receive.
insert into foo (host_id)
select mod(g, 10)
from generate_series(1, 500000) as g
order by mod(g, 10);

create index nu_foo on foo (host_id);

-- Same timing query as in the first run.
explain analyze
select count(bar)
from foo
where host_id = 1;
/*
Recorded output:
QUERY PLAN
-----------------------------------------------------------------------------------------------------------------------------
 Aggregate (cost=7550.20..7550.21 rows=1 width=32) (actual time=24.397..24.397 rows=1 loops=1)
   -> Bitmap Heap Scan on foo (cost=47.80..7543.95 rows=2500 width=32) (actual time=3.988..16.189 rows=50000 loops=1)
        Recheck Cond: (host_id = 1)
        -> Bitmap Index Scan on nu_foo (cost=0.00..47.17 rows=2500 width=0) (actual time=3.649..3.649 rows=50000 loops=1)
             Index Cond: (host_id = 1)
 Total runtime: 24.437 ms
*/

-- Remove the scratch schema and everything created inside it.
drop schema stack cascade;
Run Code Online (Sandbox Code Playgroud)
您可以在不使用该cluster命令并锁定表或为整个表生成 WAL 的情况下执行此操作。代价是您需要定期对表进行全面扫描。
基本思想是:
测试模式样本数据最初是“部分集群”的:
-- Scratch schema for the demonstration.
create schema stack;
set search_path to stack;

-- Composite type that a ctid, cast through text, can be converted to so that
-- its block number (blkno) and row number (rowno) can be read as fields.
create type t_tid as (
    blkno bigint,
    rowno integer
);

-- Autovacuum is switched off for this table.
-- NOTE(review): presumably so space freed by a delete is not made reusable
-- until vacuum is run explicitly; confirm.
create table foo (
    host_id integer,
    bar     text default repeat('a', 400)
)
with (autovacuum_enabled = false);

-- First half of the data: inserted sorted by mod(g, 10), i.e. by host_id.
insert into foo (host_id)
select mod(g, 10)
from generate_series(1, 500000) as g
order by mod(g, 10);

-- Second half: inserted without sorting, so host_id changes from row to row.
insert into foo (host_id)
select mod(g, 10)
from generate_series(1, 500000) as g;

create index nu_foo on foo (host_id);
Run Code Online (Sandbox Code Playgroud)
初始聚类统计:
-- Clustering histogram: cn is the number of distinct host_id values found in
-- one heap block; count is the number of blocks with that cn.
with block_host as (
    -- one row per (block number, host_id) pair present in the table
    select distinct
        (ctid::text::t_tid).blkno,
        host_id
    from foo
),
hosts_per_block as (
    select count(*) as cn
    from block_host
    group by blkno
)
select
    cn,
    count(*)
from hosts_per_block
group by cn
order by cn;
/*
Recorded output:
 cn | count
----+-------
  1 | 27769 <---- half clustered
  2 |     8
  5 |     1
 10 | 27778 <---- half un-clustered
*/

-- Number of distinct heap blocks that contain at least one row for host_id = 1.
select count(distinct (ctid::text::t_tid).blkno)
from foo
where host_id = 1;
/*
Recorded output:
 count
-------
 30558 <--------- lots of blocks to read for `host_id=1`
*/
Run Code Online (Sandbox Code Playgroud)
初始分析(2146.503 毫秒):
-- Baseline timing for fetching every row of one host_id before re-clustering.
explain analyze
select count(bar)
from foo
where host_id = 1;
/*
Recorded output:
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------
 Aggregate (cost=15097.30..15097.31 rows=1 width=32) (actual time=2146.157..2146.158 rows=1 loops=1)
   -> Bitmap Heap Scan on foo (cost=95.17..15084.80 rows=5000 width=32) (actual time=21.586..2092.379 rows=100000 loops=1)
        Recheck Cond: (host_id = 1)
        Rows Removed by Index Recheck: 286610
        -> Bitmap Index Scan on nu_foo (cost=0.00..93.92 rows=5000 width=0) (actual time=19.232..19.232 rows=100000 loops=1)
             Index Cond: (host_id = 1)
 Total runtime: 2146.503 ms
*/
Run Code Online (Sandbox Code Playgroud)
删除并重新插入未聚集的行:
-- Re-cluster only the poorly clustered part of the table, in one statement:
-- delete every row stored in a block that holds more than two distinct
-- host_id values, and re-insert those rows sorted by host_id.
with block_host as (
    -- one row per (block number, host_id) pair present in the table
    select distinct
        (ctid::text::t_tid).blkno,
        host_id
    from foo
),
w as (
    -- blocks that mix more than two distinct host_id values
    select blkno
    from block_host
    group by blkno
    having count(*) > 2
),
d as (
    -- Return exactly the columns the insert below consumes, rather than *,
    -- so the delete/insert column contract is explicit.
    delete from foo
    where (ctid::text::t_tid).blkno in (select blkno from w)
    returning host_id, bar
)
insert into foo (host_id, bar)
select
    host_id,
    bar
from d
order by host_id;

-- Autovacuum is disabled on foo, so reclaim the space of the deleted rows explicitly.
vacuum foo;
Run Code Online (Sandbox Code Playgroud)
新的聚类统计:
-- Clustering histogram after the re-insert: cn is the number of distinct
-- host_id values found in one heap block; count is the number of blocks with
-- that cn.
with block_host as (
    -- one row per (block number, host_id) pair present in the table
    select distinct
        (ctid::text::t_tid).blkno,
        host_id
    from foo
),
hosts_per_block as (
    select count(*) as cn
    from block_host
    group by blkno
)
select
    cn,
    count(*)
from hosts_per_block
group by cn
order by cn;
/*
Recorded output:
 cn | count
----+-------
  1 | 55541 <---- fully clustered
  2 |    16
*/

-- Number of distinct heap blocks that contain at least one row for host_id = 1.
select count(distinct (ctid::text::t_tid).blkno)
from foo
where host_id = 1;
/*
Recorded output:
 count
-------
  5558 <--------- far fewer blocks to read for `host_id=1`
*/
Run Code Online (Sandbox Code Playgroud)
新分析(48.804 毫秒):
-- Timing for fetching every row of one host_id after re-clustering.
explain analyze
select count(bar)
from foo
where host_id = 1;
/*
Recorded output:
QUERY PLAN
-------------------------------------------------------------------------------------------------------------------------------
 Aggregate (cost=16110.64..16110.65 rows=1 width=32) (actual time=48.760..48.761 rows=1 loops=1)
   -> Bitmap Heap Scan on foo (cost=131.18..16098.14 rows=5000 width=32) (actual time=8.402..32.439 rows=100000 loops=1)
        Recheck Cond: (host_id = 1)
        -> Bitmap Index Scan on nu_foo (cost=0.00..129.93 rows=5000 width=0) (actual time=7.636..7.636 rows=100000 loops=1)
             Index Cond: (host_id = 1)
 Total runtime: 48.804 ms
*/
Run Code Online (Sandbox Code Playgroud)
清理:
-- Clean up: cascade also drops the objects created inside the stack schema.
drop schema stack cascade;
Run Code Online (Sandbox Code Playgroud)
以上现在是可行的,但有点古怪(需要关闭表格的自动真空)并且需要定期全面扫描表格。我认为可以在 postgres 中构建类似的没有缺点的东西。你需要:
| 归档时间: |
|
| 查看次数: |
986 次 |
| 最近记录: |