Chr*_*ian 6 postgresql performance best-practices postgresql-performance
我有这个函数返回一组记录,我需要将这些记录保存到表中。我必须每天做一百次。
我最初的方法只是清除表中的数据并再次重新插入所有记录。
-- Full refresh, step 1: remove every existing row (deliberately unfiltered,
-- so row-level DELETE triggers fire once per removed row).
DELETE FROM MY_TABLE;
-- Full refresh, step 2: reload the table from the function's result set.
INSERT INTO MY_TABLE (COLUMN1, COLUMN2, COLUMN3)
SELECT F.COLUMN1,
       F.COLUMN2,
       F.COLUMN3
FROM MY_FUNCTION(PARAM1, PARAM2, PARAM3) AS F;
Run Code Online (Sandbox Code Playgroud)
到现在为止还挺好。但是我的表有很多触发器,当函数返回数千条记录时,这种方法效率很低。
然后,我转向了这种方法:
-- Incremental refresh of MY_TABLE: stage the function's result in a temporary
-- table, then DELETE / UPDATE / INSERT only the rows that actually differ, so
-- triggers on MY_TABLE fire only for real changes.
-- COLUMN1 is treated as the matching key throughout.

-- CREATE A TEMPORARY TABLE
-- NOTE(review): PostgreSQL accepts GLOBAL here but ignores it; the keyword is deprecated.
CREATE GLOBAL TEMPORARY TABLE MY_TEMP_TABLE
(COLUMN1 TEXT, COLUMN2 TEXT, COLUMN3 TEXT);
-- POPULATE MY TEMP TABLE WITH MY FUNCTION'S RESULT
INSERT INTO MY_TEMP_TABLE (COLUMN1, COLUMN2, COLUMN3)
SELECT COLUMN1, COLUMN2, COLUMN3
FROM MY_FUNCTION(PARAM1, PARAM2, PARAM3);
-- CREATE AN INDEX TO HELP PERFORMANCE (COLUMN1 is the leading column used by the lookups below)
CREATE INDEX MY_TEMP_TABLE_INDEX ON MY_TEMP_TABLE (COLUMN1, COLUMN2, COLUMN3);
-- DELETE FROM MY TABLE WHERE THE KEY NO LONGER EXISTS IN MY TEMP TABLE
DELETE FROM MY_TABLE T
WHERE NOT EXISTS (SELECT 1
                  FROM MY_TEMP_TABLE T2
                  WHERE T2.COLUMN1 = T.COLUMN1);
-- UPDATE MY TABLE WHERE COLUMNS ARE DIFFERENT IN MY TEMP TABLE
-- IS DISTINCT FROM is used instead of <> so that a change to or from NULL
-- is also detected (<> yields NULL, i.e. "no match", when either side is NULL).
UPDATE MY_TABLE T
SET COLUMN2 = T2.COLUMN2,
    COLUMN3 = T2.COLUMN3
FROM MY_TEMP_TABLE T2
WHERE T2.COLUMN1 = T.COLUMN1
  AND (T2.COLUMN2 IS DISTINCT FROM T.COLUMN2
       OR T2.COLUMN3 IS DISTINCT FROM T.COLUMN3);
-- INSERT INTO MY TABLE THE ROWS THAT EXIST ONLY IN MY TEMP TABLE
INSERT INTO MY_TABLE (COLUMN1, COLUMN2, COLUMN3)
SELECT T2.COLUMN1, T2.COLUMN2, T2.COLUMN3
FROM MY_TEMP_TABLE T2
WHERE NOT EXISTS (SELECT 1
                  FROM MY_TABLE T3
                  WHERE T3.COLUMN1 = T2.COLUMN1);
Run Code Online (Sandbox Code Playgroud)
但我仍然遇到性能问题。我相信创建这个 temp_table 会消耗很多资源。此外,我认为这不是最好的方法。
你们能建议另一种方法吗?或者你认为这是最好的方法?
编辑:
为了进行测试,您可以运行下面的这些脚本:
这是创建表/触发器/函数/等的脚本...
-- Information rows that users need; one row per (USER_ID, INFO_ID).
CREATE TABLE USER_INFO (
    USER_ID TEXT,
    INFO_ID TEXT,
    INFO1   TEXT,
    INFO2   TEXT,
    INFO3   TEXT,
    INFO4   TEXT,
    INFO5   TEXT,
    CONSTRAINT USER_INFO_PK PRIMARY KEY (USER_ID, INFO_ID)
);
-- A kind of flag table: a row here tells a user that their information has been
-- "refreshed" and that they should fetch their rows from USER_INFO again.
CREATE TABLE USER_HAS_NEW_INFO (
    USER_ID   TEXT,
    INFO_DATE TIMESTAMP,
    CONSTRAINT USER_HAS_NEW_INFO_PK PRIMARY KEY (USER_ID, INFO_DATE)
);
-- CREATE TRIGGER FUNCTION
-- Row-level trigger function for USER_INFO.
-- On INSERT, or on UPDATE when any of INFO1..INFO5 changed (NULL and '' are
-- treated as equal), it records (USER_ID, CURRENT_TIMESTAMP) in
-- USER_HAS_NEW_INFO unless that exact row is already present.
-- For any other operation (e.g. DELETE) it does nothing and returns NULL.
-- Returns NEW for INSERT / UPDATE.
CREATE OR REPLACE FUNCTION TF_USER_INFO()
RETURNS trigger AS
$BODY$
begin
    -- NEW is not available for DELETE, so leave before any NEW.* reference is
    -- evaluated. Returning NULL matches what "RETURN NEW" yields in that case.
    if TG_OP NOT IN ('INSERT', 'UPDATE') then
        RETURN NULL;
    end if;

    -- IF SOME INFO HAS CHANGED
    if (TG_OP = 'INSERT')
       OR
       (
           (TG_OP = 'UPDATE')
           AND
           (
               (COALESCE(NEW.INFO1,'') <> COALESCE(OLD.INFO1,'')) OR
               (COALESCE(NEW.INFO2,'') <> COALESCE(OLD.INFO2,'')) OR
               (COALESCE(NEW.INFO3,'') <> COALESCE(OLD.INFO3,'')) OR
               (COALESCE(NEW.INFO4,'') <> COALESCE(OLD.INFO4,'')) OR
               (COALESCE(NEW.INFO5,'') <> COALESCE(OLD.INFO5,''))
           )
       )
    then
        -- INSERT A NEW ROW INTO USER_HAS_NEW_INFO
        -- The NOT EXISTS check skips the insert when the same
        -- (USER_ID, CURRENT_TIMESTAMP) pair is already stored, so the primary
        -- key is not violated when many rows of one user change together.
        INSERT INTO USER_HAS_NEW_INFO (USER_ID, INFO_DATE)
        SELECT NEW.USER_ID, CURRENT_TIMESTAMP
        WHERE NOT EXISTS (SELECT 1
                          FROM USER_HAS_NEW_INFO
                          WHERE USER_ID = NEW.USER_ID
                            AND INFO_DATE = CURRENT_TIMESTAMP
                         );
    end if;
    RETURN NEW;
end;
$BODY$
LANGUAGE plpgsql VOLATILE;
-- Attach TF_USER_INFO to USER_INFO: it runs once per affected row, after the
-- row has been inserted, updated or deleted.
CREATE TRIGGER T_USER_INFO
    AFTER INSERT OR UPDATE OR DELETE ON USER_INFO
    FOR EACH ROW
    EXECUTE PROCEDURE TF_USER_INFO();
-- Test stand-in for the real calculation: returns one USER_INFO row for every
-- combination of 1500 user ids and 500 info ids (750,000 rows), with each of
-- INFO1..INFO5 filled with a random string of the form 'A=<n>|B=<n>|C=<n>'.
CREATE OR REPLACE FUNCTION CALCULATE_USERS_INFO()
RETURNS SETOF USER_INFO AS
$BODY$
BEGIN
    -- HERE GOES A COMPLEX QUERY PLUS SOME CALCS AND VALIDATIONS
    -- BUT, FOR TESTING PURPOSES, WE CAN DO THE FOLLOWING:
    -- The generated ids are integers; they are cast to TEXT explicitly because
    -- RETURN QUERY requires the column types to match USER_INFO exactly.
    RETURN QUERY
    SELECT U.USER_ID::TEXT,
           I.INFO_ID::TEXT,
           'A=' || TRUNC(RANDOM() * 1000) || '|' ||
           'B=' || TRUNC(RANDOM() * 1000) || '|' ||
           'C=' || TRUNC(RANDOM() * 1000),
           'A=' || TRUNC(RANDOM() * 1000) || '|' ||
           'B=' || TRUNC(RANDOM() * 1000) || '|' ||
           'C=' || TRUNC(RANDOM() * 1000),
           'A=' || TRUNC(RANDOM() * 1000) || '|' ||
           'B=' || TRUNC(RANDOM() * 1000) || '|' ||
           'C=' || TRUNC(RANDOM() * 1000),
           'A=' || TRUNC(RANDOM() * 1000) || '|' ||
           'B=' || TRUNC(RANDOM() * 1000) || '|' ||
           'C=' || TRUNC(RANDOM() * 1000),
           'A=' || TRUNC(RANDOM() * 1000) || '|' ||
           'B=' || TRUNC(RANDOM() * 1000) || '|' ||
           'C=' || TRUNC(RANDOM() * 1000)
    FROM GENERATE_SERIES(1,1500) AS U(USER_ID)
    CROSS JOIN GENERATE_SERIES(1,500) AS I(INFO_ID);
    RETURN;
END;
$BODY$
LANGUAGE plpgsql VOLATILE;
Run Code Online (Sandbox Code Playgroud)
这是我每天运行多次的脚本:
-- Incremental refresh of USER_INFO from CALCULATE_USERS_INFO().
-- Rows are matched on (USER_ID, INFO_ID); only rows that really differ are
-- touched, so the row-level trigger on USER_INFO fires only for real changes.
--
-- NOTE: because of ON COMMIT DROP, the whole script must run inside one
-- explicit transaction (BEGIN ... COMMIT). In autocommit mode the temporary
-- table is dropped as soon as the CREATE statement commits.

-- CREATE A TEMPORARY TABLE
-- NOTE(review): PostgreSQL accepts GLOBAL here but ignores it; the keyword is deprecated.
CREATE GLOBAL TEMPORARY TABLE USER_INFO_TEMP
(USER_ID TEXT, INFO_ID TEXT, INFO1 TEXT, INFO2 TEXT, INFO3 TEXT, INFO4 TEXT, INFO5 TEXT)
ON COMMIT DROP;
-- POPULATE MY TEMP TABLE WITH MY FUNCTION'S RESULT
INSERT INTO USER_INFO_TEMP (USER_ID, INFO_ID, INFO1, INFO2, INFO3, INFO4, INFO5)
SELECT USER_ID, INFO_ID, INFO1, INFO2, INFO3, INFO4, INFO5
FROM CALCULATE_USERS_INFO();
-- CREATE AN INDEX TO HELP PERFORMANCE
CREATE INDEX USER_INFO_TEMP_INDEX ON USER_INFO_TEMP (USER_ID, INFO_ID);
-- DELETE FROM MY TABLE WHERE NOT EXISTS IN MY TEMP TABLE
DELETE FROM USER_INFO T
WHERE NOT EXISTS (SELECT 1
                  FROM USER_INFO_TEMP T2
                  WHERE T2.USER_ID = T.USER_ID
                    AND T2.INFO_ID = T.INFO_ID);
-- UPDATE MY TABLE WHERE COLUMNS ARE DIFFERENT IN MY TEMP TABLE
-- IS DISTINCT FROM is used instead of <> so that a value changing to or from
-- NULL is also picked up (<> yields NULL when either side is NULL, which
-- would silently leave such rows stale).
UPDATE USER_INFO T
SET INFO1 = T2.INFO1,
    INFO2 = T2.INFO2,
    INFO3 = T2.INFO3,
    INFO4 = T2.INFO4,
    INFO5 = T2.INFO5
FROM USER_INFO_TEMP T2
WHERE T2.USER_ID = T.USER_ID
  AND T2.INFO_ID = T.INFO_ID
  AND (T2.INFO1 IS DISTINCT FROM T.INFO1 OR
       T2.INFO2 IS DISTINCT FROM T.INFO2 OR
       T2.INFO3 IS DISTINCT FROM T.INFO3 OR
       T2.INFO4 IS DISTINCT FROM T.INFO4 OR
       T2.INFO5 IS DISTINCT FROM T.INFO5
      );
-- INSERT INTO TABLE WHERE EXISTS IN TEMP AND NOT EXISTS IN TABLE
INSERT INTO USER_INFO (USER_ID, INFO_ID, INFO1, INFO2, INFO3, INFO4, INFO5)
SELECT T2.USER_ID, T2.INFO_ID, T2.INFO1, T2.INFO2, T2.INFO3, T2.INFO4, T2.INFO5
FROM USER_INFO_TEMP T2
WHERE NOT EXISTS (SELECT 1
                  FROM USER_INFO T3
                  WHERE T3.USER_ID = T2.USER_ID
                    AND T3.INFO_ID = T2.INFO_ID
                 );
Run Code Online (Sandbox Code Playgroud)
这在很大程度上取决于基数。
新旧表各有多少行?其中有多少行会导致 DELETE / UPDATE / INSERT?
TRUNCATE 最快
通常,如果表的大部分内容都发生了变化,先 TRUNCATE 再从函数 INSERT 可能是最快的方法。如果在同一个事务中完成,Postgres 不需要写 WAL(因为无论如何我们都是从头开始的)。此外,您会得到一个没有膨胀的全新表,这对该过程的下一次迭代也有积极影响。对于大表,还可以先删除索引、加载完成后再重新创建,等等。详细信息见这个相关答案:
其余的仅适用于出于某种原因要保留现有行的情况。
如果触发器妨碍了(正如你所写的,我不相信,但让我们现在假设)。或者,如果您在表中有其他不能丢失的列。
取决于更改集中有多少行(从函数返回)...
少于约 1000 行时(具体界限取决于很多因素),使用数据修改 CTE(它会自动为函数结果生成一个廉价的内部临时结果集)可能是最快的:
-- Synchronise user_info with the result of calculate_users_info() in a single
-- statement using data-modifying CTEs:
--   x   : the function result (computed once, reused by every step below)
--   del : remove rows whose key (user_id, info_id) is no longer returned
--   upd : overwrite rows whose info columns differ
--   final INSERT : add rows whose key does not exist yet
-- All parts of the statement see the same snapshot of user_info, i.e. the
-- state before the statement started.
WITH x AS (
   SELECT user_id, info_id, info1, info2, info3, info4, info5
   FROM   calculate_users_info()
   )
, del AS (
   DELETE FROM user_info t
   WHERE  NOT EXISTS (
      SELECT 1 FROM x
      WHERE  x.user_id = t.user_id
      AND    x.info_id = t.info_id
      )
   )
, upd AS (
   UPDATE user_info t
   SET   (info1, info2, info3, info4, info5)
       = (x.info1, x.info2, x.info3, x.info4, x.info5)
   FROM   x
   WHERE  x.user_id = t.user_id
   AND    x.info_id = t.info_id
   -- IS DISTINCT FROM (not <>) so changes to or from NULL are detected too
   AND   (x.info1 IS DISTINCT FROM t.info1 OR
          x.info2 IS DISTINCT FROM t.info2 OR
          x.info3 IS DISTINCT FROM t.info3 OR
          x.info4 IS DISTINCT FROM t.info4 OR
          x.info5 IS DISTINCT FROM t.info5)
   )
INSERT INTO user_info
      (user_id, info_id, info1, info2, info3, info4, info5)
SELECT x.user_id, x.info_id, x.info1, x.info2, x.info3, x.info4, x.info5
FROM   x
WHERE  NOT EXISTS (
   SELECT 1
   FROM   user_info t3
   WHERE  t3.user_id = x.user_id
   AND    t3.info_id = x.info_id
   )
;
Run Code Online (Sandbox Code Playgroud)
在这种情况下,您基于临时表构建的脚本看起来基本没问题。与 CTE 相比,它的主要优点是可以在临时表上建立索引,并且(在运行 ANALYZE 之后)拥有 CTE 所缺少的统计信息。
我还有其他一些建议:
-- Incremental refresh of user_info through a temporary staging table.
-- Run inside one transaction: ON COMMIT DROP removes the temp table at commit.
CREATE TEMP TABLE user_info_tmp ON COMMIT DROP AS  -- directly from SELECT
SELECT * FROM calculate_users_info();

CREATE INDEX user_info_tmp_idx ON user_info_tmp (user_id, info_id);

ANALYZE user_info_tmp;                       -- !!! temp tables are not analyzed automatically

DELETE FROM user_info t                      -- NOT EXISTS anti-join
WHERE  NOT EXISTS (
   SELECT 1 FROM user_info_tmp x
   WHERE  x.user_id = t.user_id
   AND    x.info_id = t.info_id
   );

ANALYZE user_info;                           -- only if large parts have been removed

UPDATE user_info t                           -- with short syntax
SET   (info1, info2, info3, info4, info5)
    = (x.info1, x.info2, x.info3, x.info4, x.info5)  -- shorter, not faster
FROM   user_info_tmp x
WHERE  x.user_id = t.user_id
AND    x.info_id = t.info_id
-- IS DISTINCT FROM (not <>) so changes to or from NULL are detected too
AND   (x.info1 IS DISTINCT FROM t.info1 OR x.info2 IS DISTINCT FROM t.info2
    OR x.info3 IS DISTINCT FROM t.info3 OR x.info4 IS DISTINCT FROM t.info4
    OR x.info5 IS DISTINCT FROM t.info5);

INSERT INTO user_info                        -- with join syntax
      (user_id, info_id, info1, info2, info3, info4, info5)
-- columns are qualified with x: after the join, info1..info5 exist in both
-- tables and an unqualified reference would be ambiguous
SELECT x.user_id, x.info_id, x.info1, x.info2, x.info3, x.info4, x.info5
FROM   user_info_tmp x
LEFT   JOIN user_info u USING (user_id, info_id)
WHERE  u.user_id IS NULL;                    -- no match in user_info; shorter, maybe faster
Run Code Online (Sandbox Code Playgroud)
临时表不会被自动分析。此外,在同一事务中创建并立即使用的表,通常也不会给 autovacuum 留出启动的机会。详细信息:
在 9.1 下仍然推荐常规的 VACUUM ANALYZE 吗?
由于这两个原因,您需要手动对该表运行 ANALYZE,最好就在我在脚本中放置它的位置。这可以避免严重误导的查询计划。次要的额外优化:您可以把无关列的统计目标设置为 0——在本例中即除 (user_id, info_id) 之外的所有列。
不要对临时表使用 GLOBAL。根据文档:
GLOBAL 或 LOCAL
为保持兼容性而被忽略。不推荐使用这些关键字;
详情请参阅 CREATE TABLE。
您可以根据函数结果自动创建临时表。代码更短,速度也更快。
考虑更短的语法变体,它们主要是缩短代码并且不会对性能产生太大影响。
work_mem / temp_buffers
无论哪种方式,您都需要足够的 RAM 才能使其快速运行。这在基数较小时几乎无关紧要,但对大表至关重要。在配置尚可的环境中,几千行不应触及任何内存限制。如果行数更多,请务必为 CTE 设置足够的 work_mem,或为临时表设置足够的 temp_buffers。相关回答:
归档时间: |
|
查看次数: |
21024 次 |
最近记录: |