想象一下,我在 Postgres 13 中有一个像这样的表:
-- Demo table used throughout: one short varchar column and one unbounded
-- text column (bio) whose storage strategy is being investigated.
CREATE TABLE public.people (
    id        integer PRIMARY KEY,
    full_name varchar(255),
    bio       text
);
Run Code Online (Sandbox Code Playgroud)
然后,我插入一行,其中包含足够的字符,以便将 Bio 写入 TOAST 表(4000 个随机字节,应压缩到 > 2Kb):
-- Row 1: bio is a 4000-character string built by concatenating
-- chr(65 + round(random() * 25)), i.e. random letters in the range A-Z.
# insert into people values (1, 'joe toast', (SELECT array_to_string(ARRAY(SELECT chr((65 + round(random() * 25)) :: integer) FROM generate_series(1,4000)), '')));
INSERT 0 1
Run Code Online (Sandbox Code Playgroud)
然后插入一行,其 bio 的字符数足够多、但压缩后仍可内联存储(3000 个重复字节,应压缩至 < 2Kb):
-- Row 2: bio is chr(65) ('A') repeated 3000 times, i.e. highly repetitive data.
# insert into people values (2, 'joe compressed', (SELECT array_to_string(ARRAY(SELECT chr(65) FROM generate_series(1,3000)), '')));
INSERT 0 1
Run Code Online (Sandbox Code Playgroud)
最后插入一行,其 bio 仅包含几个字符,因此将以内联方式存储(10 个重复字节):
-- Row 3: bio is a literal 10-character string.
# insert into people values (3, 'joe inline', 'aaaaaaaaaa');
INSERT 0 1
Run Code Online (Sandbox Code Playgroud)
有什么方法可以让我检测每个元组中 bio 的存储策略吗?我可以报告 bio 以内联方式存储或存储在 TOAST 中的行所占的百分比(例如“22% 的元组以内联方式存储 bio,78% 存储在 TOAST 中”)吗?
一个相关的问题:我是否可以知道磁盘上按内联、内联压缩和 TOAST 存储分解的元组的字节数?
上下文:我正在使用一个总计超过 10 亿行的分区表,并且我有兴趣了解特定列内联存储与 TOAST 中存储的频率。
我可以获取每个 bio 在磁盘上的大小;在其中一种情况下,它显然是内联压缩后的大小:
-- Reports pg_column_size(bio) for every row, ordered by id.
# select id, full_name, pg_column_size(bio) from people order by id;
id | full_name | pg_column_size
----+----------------+----------------
1 | joe toast | 4000
2 | joe compressed | 44
3 | joe inline | 11
(3 rows)
Run Code Online (Sandbox Code Playgroud)
将该大小与未压缩数据的大小进行比较可以告诉我们有关压缩的一些信息,但是它可以告诉我们有关 TOAST 状态的任何信息吗?
-- Shows pg_column_size(bio) next to length(bio) (character count) for comparison.
# select id, full_name, pg_column_size(bio), length(bio) from people order by id;
id | full_name | pg_column_size | length
----+----------------+----------------+--------
1 | joe toast | 4000 | 4000
2 | joe compressed | 44 | 3000
3 | joe inline | 11 | 10
Run Code Online (Sandbox Code Playgroud)
我可以手动检查 TOAST 表中的一些行:
-- Looks up the name of the relation referenced by people's reltoastrelid in pg_class.
# select relname from pg_class where oid = (select reltoastrelid from pg_class where relname='people');
relname
----------------
pg_toast_20138
-- Sums the chunk_data lengths per chunk_id in the TOAST relation found above.
# select chunk_id, sum(length(chunk_data)) from pg_toast.pg_toast_20138 group by chunk_id;
chunk_id | sum
----------+------
20149 | 4000
Run Code Online (Sandbox Code Playgroud)
在一般情况下,以下说法正确吗?
-- Heuristic under question: classifies each row purely by comparing
-- pg_column_size(bio) with length(bio):
--   size <  length  -> 'inline-compressed'
--   size =  length  -> 'toast'
--   otherwise       -> 'inline'
-- NOTE(review): this is the asker's hypothesis, not an established rule.
# select id, full_name, pg_column_size(bio), length(bio),
case
when pg_column_size(bio) < length(bio) then 'inline-compressed'
when pg_column_size(bio) = length(bio) then 'toast'
else
'inline'
end as storage_strategy
from people order by id;
id | full_name | pg_column_size | length | storage_strategy
----+----------------+----------------+--------+-------------------
1 | joe toast | 4000 | 4000 | toast
2 | joe compressed | 44 | 3000 | inline-compressed
3 | joe inline | 11 | 10 | inline
Run Code Online (Sandbox Code Playgroud)
`out_of_line` 表示数据存储在 TOAST 中。
`bytes_on_disk` 和 `uncompressed_bytes` 可能包含一些元数据长度(1 或 4 字节),有待日后完善。
查询中使用的是 `inner join people`;如果您想查看不可见的行(例如已删除但尚未清理),请改用 `left join people`。
+--+--------------+------------------+----------+-----------+-------------+
|id|full_name |uncompressed_bytes|compressed|out_of_line|bytes_on_disk|
+--+--------------+------------------+----------+-----------+-------------+
|1 |joe toast |4004 |false |true |4000 |
|2 |joe compressed|3000 |true |false |44 |
|3 |joe inline |10 |false |false |11 |
Run Code Online (Sandbox Code Playgroud)
首先启用 pageinspect 扩展,并创建用于从列元数据中获取信息的函数:
-- pageinspect provides get_raw_page / heap_page_items / tuple_data_split.
-- "if not exists" keeps the script re-runnable when the extension is already installed.
create extension if not exists pageinspect;
-- Returns true when the first byte of the raw datum equals 1, which this
-- script treats as the marker of an out-of-line (TOAST pointer) value.
-- Returns NULL for a NULL input.
create or replace function is_toasted(datum_header bytea) returns bool as $$
declare
    first_byte int := get_byte(datum_header, 0);
begin
    return first_byte = 1;
end;
$$ language plpgsql;
-- Returns true when the datum is not a TOAST pointer (per is_toasted) and the
-- lowest bit of its first byte is set, i.e. the value carries a 1-byte header.
create or replace function is_1b_meta(datum_header bytea) returns bool as $$
declare
    low_bit int := get_byte(datum_header, 0) & 1;
begin
    return (not is_toasted(datum_header)) and low_bit > 0;
end;
$$ LANGUAGE plpgsql;
-- Reports whether the value is stored compressed.
--   * 1-byte header      -> always false.
--   * inline, 4-byte hdr -> bit 0x02 of the first header byte.
--   * TOAST pointer      -> true when the stored size plus 4 differs from the
--                           original length recorded in the pointer.
create or replace function is_compressed(datum_header bytea) returns bool as $$
begin
    if is_1b_meta(datum_header) then
        return false;
    end if;

    if not is_toasted(datum_header) then
        return (get_byte(datum_header, 0) & 2) > 0;
    end if;

    return bytes_on_disk(datum_header) + 4 <> toasted_original_len(datum_header);
end;
$$ LANGUAGE plpgsql;
-- Length in bytes of the header to inspect: 1 when is_1b_meta() is true,
-- otherwise 4.
create or replace function meta_len(datum_header bytea) returns int as $$
begin
    return case when is_1b_meta(datum_header) then 1 else 4 end;
end;
$$ language plpgsql;
-- Decodes the stored size of a value from its raw header bytes.
--   * 1-byte header      : upper 7 bits of byte 0.
--   * inline 4-byte hdr  : 30-bit length spread over bytes 0..3 (byte 0 holds
--                          the low 6 bits above two flag bits).
--   * TOAST pointer      : 32-bit little-endian field at bytes 6..9.
-- NOTE(review): the byte order used here assumes a little-endian server.
-- On PostgreSQL 14+ the pointer field at bytes 6..9 is va_extinfo, whose top
-- two bits carry the compression method; they are masked off so the result is
-- the external size only. On earlier versions those bits are always zero, so
-- the mask does not change the result.
create or replace function bytes_on_disk(datum_header bytea) returns int language plpgsql as $$
begin
    if is_1b_meta(datum_header) then
        return get_byte(datum_header, 0) >> 1;
    elsif not is_toasted(datum_header) then
        return (get_byte(datum_header, 0) >> 2)
             | (get_byte(datum_header, 1) << 6)
             | (get_byte(datum_header, 2) << 14)
             | (get_byte(datum_header, 3) << 22);
    else
        return (
              get_byte(datum_header, 6)
            | (get_byte(datum_header, 7) << 8)
            | (get_byte(datum_header, 8) << 16)
            | (get_byte(datum_header, 9) << 24)
        ) & 1073741823;  -- 0x3FFFFFFF: keep the low 30 bits (size), drop method bits
    end if;
end;
$$;
-- For a TOAST pointer, returns the 32-bit little-endian value stored at
-- bytes 2..5 of the pointer (the original size recorded for the value).
-- For anything else it falls back to (byte 0 >> 1); the original author marked
-- that fallback as no longer needed.
create or replace function toasted_original_len(datum_header bytea) returns integer language plpgsql as $$
begin
    if not is_toasted(datum_header) then
        return get_byte(datum_header, 0) >> 1;
    end if;

    return get_byte(datum_header, 2)
         | (get_byte(datum_header, 3) << 8)
         | (get_byte(datum_header, 4) << 16)
         | (get_byte(datum_header, 5) << 24);
end;
$$;
-- Debug helper: renders the first meta_len() header bytes as a bit string,
-- 8 bits per byte, in byte order.
create or replace function meta_bits(datum_header bytea) returns bit as $$
declare
    header_len int := meta_len(datum_header);
    acc bit varying(32) := '';
begin
    for pos in 0 .. header_len - 1 loop
        acc := acc || get_byte(datum_header, pos)::bit(8);
    end loop;
    return acc;
end;
$$ language plpgsql;
Run Code Online (Sandbox Code Playgroud)
现在您可以选择某一列(`[3]` 表示第 3 列),获取其二进制数据并解析头部(header):
-- Per-row storage report for people.bio (attribute number 3).
-- Step 1 (bits): walk every heap page up to the highest page that holds a
-- visible row and pull the raw, un-detoasted bytes of the 3rd attribute of
-- each line pointer.
-- Step 2: join those raw bytes back to the visible rows and decode the header.
with bits as (
    select
        -- Physical location of this line pointer. t_ctid is deliberately NOT
        -- used: for an updated tuple it points at the newer version, which
        -- would attach the dead tuple's bytes to the live row.
        format('(%s,%s)', page, lp)::tid as ctid,
        (tuple_data_split('people'::regclass, t_data, t_infomask, t_infomask2, t_bits))[3] as bits
    from generate_series(0, (select max((ctid::text::point)[0]::int) from people)) as page
    cross join lateral heap_page_items(get_raw_page('people', page))
)
select
    p.id,
    p.full_name,
    case
        when is_toasted(bits) then toasted_original_len(bits)
        -- octet_length gives bytes; length() would count characters.
        else octet_length(p.bio)
    end as uncompressed_bytes,
    --meta_bits(bits),  -- optional: uncomment to dump the raw header bits
    is_compressed(bits) as compressed,
    is_toasted(bits) as out_of_line,
    bytes_on_disk(bits)
from bits
inner join people as p on p.ctid = bits.ctid;
Run Code Online (Sandbox Code Playgroud)
该信息由 Postgres 内部存储的内容确定。varlena(可变长度字段)元数据有 3 个选项(代码、文档、演示文稿):