从 Postgres 9.x 中的复杂查询插入/更新/删除表记录的有效方法

Chr*_*ian 6 postgresql performance best-practices postgresql-performance

我有这个函数返回一组记录,我需要将这些记录保存到表中。我必须每天做一百次。

我最初的方法只是清除表中的数据并再次重新插入所有记录。

-- Full refresh, step 1: remove every existing row (intentionally no WHERE clause).
delete from my_table;

-- Full refresh, step 2: reload the table from the function's result set.
insert into my_table (
    column1,
    column2,
    column3
)
select
    f.column1,
    f.column2,
    f.column3
from my_function(param1, param2, param3) as f;
Run Code Online (Sandbox Code Playgroud)

到现在为止还挺好。但是我的表有很多触发器,当函数返回数千条记录时,这种方法效率很低。

然后,我转向了这种方法:

-- CREATE A TEMPORARY TABLE
-- The GLOBAL keyword was removed: PostgreSQL ignores it for temporary tables
-- and documents it as deprecated, so plain TEMPORARY is the supported spelling.
CREATE TEMPORARY TABLE MY_TEMP_TABLE 
(COLUMN1 TEXT, COLUMN2 TEXT, COLUMN3 TEXT);

-- POPULATE MY TEMP TABLE WITH MY FUNCTION'S RESULT
INSERT INTO MY_TEMP_TABLE (COLUMN1, COLUMN2, COLUMN3)
SELECT COLUMN1, COLUMN2, COLUMN3 
FROM MY_FUNCTION(PARAM1, PARAM2, PARAM3);

-- CREATE AN INDEX TO HELP PERFORMANCE
-- COLUMN1 is the leading index column, so lookups by COLUMN1 alone can use it.
CREATE INDEX MY_TEMP_TABLE_INDEX ON MY_TEMP_TABLE (COLUMN1, COLUMN2, COLUMN3);

-- DELETE FROM MY TABLE THE ROWS WHOSE COLUMN1 DOES NOT EXIST IN MY TEMP TABLE
-- (fixed the misspelled column reference COLUNN1 -> COLUMN1)
DELETE FROM MY_TABLE T 
WHERE NOT EXISTS (SELECT 1 
                  FROM MY_TEMP_TABLE T2 
                  WHERE T2.COLUMN1 = T.COLUMN1);

-- UPDATE MY TABLE WHERE COLUMNS ARE DIFFERENT IN MY TEMP TABLE
-- Rows are matched on COLUMN1 (fixed the misspelled COLUNN1).
-- IS DISTINCT FROM is used instead of <> so that a change between NULL and a
-- non-NULL value is also detected; <> yields NULL (not true) in that case and
-- the row would silently be skipped.
UPDATE MY_TABLE T 
SET COLUMN2 = T2.COLUMN2,
    COLUMN3 = T2.COLUMN3 
FROM MY_TEMP_TABLE T2 
WHERE T2.COLUMN1 = T.COLUMN1
  AND (T2.COLUMN2 IS DISTINCT FROM T.COLUMN2 OR
       T2.COLUMN3 IS DISTINCT FROM T.COLUMN3);

-- INSERT INTO MY TABLE THE ROWS THAT EXIST IN MY TEMP TABLE BUT NOT YET IN MY TABLE
-- Fixes: "INSERT INTO FROM" is not valid syntax, an INSERT target takes no
-- stray alias here, the anti-join must read MY_TABLE (not the keyword TABLE),
-- COLUNN1 was a misspelling of COLUMN1, and the parentheses were unbalanced.
INSERT INTO MY_TABLE (COLUMN1, COLUMN2, COLUMN3)
SELECT T2.COLUMN1, T2.COLUMN2, T2.COLUMN3
  FROM MY_TEMP_TABLE T2
 WHERE NOT EXISTS (SELECT 1
                     FROM MY_TABLE T3
                    WHERE T3.COLUMN1 = T2.COLUMN1);
Run Code Online (Sandbox Code Playgroud)

但我仍然遇到性能问题。我相信创建这个 temp_table 会消耗很多资源。此外,我认为这不是最好的方法。

你们能建议另一种方法吗?或者你认为这是最好的方法?

编辑:

为了进行测试,您可以运行下面的这些脚本:

这是创建表/触发器/函数/等的脚本...

-- Information that users need; rows are keyed by (user_id, info_id).
create table user_info (
    user_id text,
    info_id text,
    info1   text,
    info2   text,
    info3   text,
    info4   text,
    info5   text,
    constraint user_info_pk primary key (user_id, info_id)
);

-- A kind of flag table: a row tells a user that their information has been
-- "refreshed" and that they should fetch their rows from user_info.
create table user_has_new_info (
    user_id   text,
    info_date timestamp,
    constraint user_has_new_info_pk primary key (user_id, info_date)
);


-- CREATE TRIGGER FUNCTION
-- Row-level trigger function for USER_INFO.
-- Purpose: when a row is inserted, or when an update changes any of
-- INFO1..INFO5 (NULL and '' are treated as equal), record a
-- (USER_ID, CURRENT_TIMESTAMP) row in USER_HAS_NEW_INFO unless that exact
-- row already exists.
-- Returns: OLD for DELETE, NEW otherwise.
CREATE OR REPLACE FUNCTION TF_USER_INFO()
  RETURNS trigger AS
$BODY$
declare
  info_changed boolean := false;
begin

  -- NEW only carries a row for INSERT/UPDATE and OLD only for UPDATE/DELETE.
  -- Each record is therefore read strictly inside the branch of its own
  -- operation, instead of relying on the evaluation order of one combined
  -- boolean expression that mentions both records.
  if TG_OP = 'DELETE' then
    -- A deleted row never raises the flag; there is no NEW row to return.
    RETURN OLD;
  elsif TG_OP = 'INSERT' then
    -- Every inserted row counts as new information.
    info_changed := true;
  elsif TG_OP = 'UPDATE' then
    -- IF SOME INFO HAS CHANGED
    info_changed :=
         (COALESCE(NEW.INFO1,'') <> COALESCE(OLD.INFO1,'')) OR
         (COALESCE(NEW.INFO2,'') <> COALESCE(OLD.INFO2,'')) OR
         (COALESCE(NEW.INFO3,'') <> COALESCE(OLD.INFO3,'')) OR
         (COALESCE(NEW.INFO4,'') <> COALESCE(OLD.INFO4,'')) OR
         (COALESCE(NEW.INFO5,'') <> COALESCE(OLD.INFO5,''));
  end if;

  if info_changed then

    -- INSERT A NEW ROW INTO USER_HAS_NEW_INFO
    -- CURRENT_TIMESTAMP is the transaction start time, so the NOT EXISTS
    -- check keeps this to one row per user per transaction.
    INSERT INTO USER_HAS_NEW_INFO (USER_ID, INFO_DATE)
    SELECT NEW.USER_ID, CURRENT_TIMESTAMP
    WHERE  NOT EXISTS (SELECT 1 
                         FROM USER_HAS_NEW_INFO 
                        WHERE USER_ID = NEW.USER_ID
                          AND INFO_DATE = CURRENT_TIMESTAMP
                      );
  end if;

  RETURN NEW;
end;
$BODY$
  LANGUAGE plpgsql VOLATILE;



-- CREATE TRIGGER
-- Fires TF_USER_INFO() once per inserted or updated USER_INFO row.
-- DELETE was removed from the event list: TF_USER_INFO() only acts when
-- TG_OP is 'INSERT' or 'UPDATE', so firing it for every deleted row was pure
-- per-row overhead with no effect.
-- NOTE(review): re-add DELETE here if the function is ever extended to flag
-- users on deletion.
CREATE TRIGGER T_USER_INFO
  AFTER INSERT OR UPDATE
  ON USER_INFO
  FOR EACH ROW
  EXECUTE PROCEDURE TF_USER_INFO();




-- Test stand-in for the real calculation: returns one USER_INFO row for every
-- combination of USER_ID 1..1500 and INFO_ID 1..500, with INFO1..INFO5 filled
-- with random strings of the form 'A=<n>|B=<n>|C=<n>'.
CREATE OR REPLACE FUNCTION CALCULATE_USERS_INFO()
RETURNS SETOF USER_INFO AS
$BODY$
BEGIN

    -- HERE GOES A COMPLEX QUERY PLUS SOME CALCS AND VALIDATIONS
    -- BUT, FOR TESTING PURPOSES, WE CAN DO FOLLOWING:

    -- RETURN QUERY hands the whole result set back in one step instead of
    -- looping over it and calling RETURN NEXT once per row.
    -- The generated integers are cast to TEXT explicitly because RETURN QUERY
    -- requires the column types to match the USER_INFO row type exactly.
    RETURN QUERY
    SELECT U.USER_ID::TEXT,
           I.INFO_ID::TEXT,
           'A=' || TRUNC(RANDOM() * 1000) || '|' || 
           'B=' || TRUNC(RANDOM() * 1000) || '|' || 
           'C=' || TRUNC(RANDOM() * 1000) AS INFO1,

           'A=' || TRUNC(RANDOM() * 1000) || '|' || 
           'B=' || TRUNC(RANDOM() * 1000) || '|' || 
           'C=' || TRUNC(RANDOM() * 1000) AS INFO2,

           'A=' || TRUNC(RANDOM() * 1000) || '|' || 
           'B=' || TRUNC(RANDOM() * 1000) || '|' || 
           'C=' || TRUNC(RANDOM() * 1000) AS INFO3,

           'A=' || TRUNC(RANDOM() * 1000) || '|' || 
           'B=' || TRUNC(RANDOM() * 1000) || '|' || 
           'C=' || TRUNC(RANDOM() * 1000) AS INFO4,

           'A=' || TRUNC(RANDOM() * 1000) || '|' || 
           'B=' || TRUNC(RANDOM() * 1000) || '|' || 
           'C=' || TRUNC(RANDOM() * 1000) AS INFO5

      FROM GENERATE_SERIES(1,1500) AS U(USER_ID)
     CROSS JOIN GENERATE_SERIES(1,500) AS I(INFO_ID);

    RETURN;

END;
$BODY$
  LANGUAGE plpgsql VOLATILE;
Run Code Online (Sandbox Code Playgroud)

这是我每天运行多次的脚本:

-- CREATE A TEMPORARY TABLE
-- The GLOBAL keyword was removed: PostgreSQL ignores it for temporary tables
-- and documents it as deprecated.
-- ON COMMIT DROP removes the table at the end of the transaction, so this
-- statement and every statement that uses USER_INFO_TEMP must run inside one
-- transaction block.
CREATE TEMPORARY TABLE USER_INFO_TEMP 
(USER_ID TEXT, INFO_ID TEXT, INFO1 TEXT, INFO2 TEXT, INFO3 TEXT, INFO4 TEXT, INFO5 TEXT)
ON COMMIT DROP;

-- POPULATE MY TEMP TABLE WITH MY FUNCTION'S RESULT
INSERT INTO USER_INFO_TEMP (USER_ID, INFO_ID, INFO1, INFO2, INFO3, INFO4, INFO5)
SELECT USER_ID, INFO_ID, INFO1, INFO2, INFO3, INFO4, INFO5 
FROM CALCULATE_USERS_INFO();

-- CREATE AN INDEX ON THE KEY COLUMNS TO HELP PERFORMANCE
CREATE INDEX USER_INFO_TEMP_INDEX ON USER_INFO_TEMP (USER_ID, INFO_ID);

-- Remove the user_info rows whose (user_id, info_id) key has no counterpart
-- in the temporary table.
delete from user_info as target
where not exists (
    select 1
    from user_info_temp as staged
    where staged.user_id = target.user_id
      and staged.info_id = target.info_id
);

-- UPDATE MY TABLE WHERE COLUMNS ARE DIFFERENT IN MY TEMP TABLE
-- Rows are matched on (USER_ID, INFO_ID).
-- IS DISTINCT FROM replaces <> so that a change between NULL and a non-NULL
-- value is detected as well; with <> such a comparison yields NULL (not true)
-- and the changed row would be skipped.
UPDATE USER_INFO T 
SET INFO1 = T2.INFO1,
    INFO2 = T2.INFO2,
    INFO3 = T2.INFO3,
    INFO4 = T2.INFO4,
    INFO5 = T2.INFO5
FROM USER_INFO_TEMP T2 
WHERE T2.USER_ID = T.USER_ID
  AND T2.INFO_ID = T.INFO_ID
  AND (T2.INFO1 IS DISTINCT FROM T.INFO1 OR 
       T2.INFO2 IS DISTINCT FROM T.INFO2 OR
       T2.INFO3 IS DISTINCT FROM T.INFO3 OR
       T2.INFO4 IS DISTINCT FROM T.INFO4 OR
       T2.INFO5 IS DISTINCT FROM T.INFO5
      );

-- Add the rows of the temporary table whose (user_id, info_id) key is not yet
-- present in user_info.
insert into user_info (
    user_id, info_id, info1, info2, info3, info4, info5
)
select
    staged.user_id,
    staged.info_id,
    staged.info1,
    staged.info2,
    staged.info3,
    staged.info4,
    staged.info5
from user_info_temp as staged
where not exists (
    select 1
    from user_info as existing
    where existing.user_id = staged.user_id
      and existing.info_id = staged.info_id
);
Run Code Online (Sandbox Code Playgroud)

Erw*_*ter 9

这在很大程度上取决于基数
新旧表各有多少行?其中各有多少行会导致 DELETE / UPDATE / INSERT?

TRUNCATE 最快

通常,如果表的大部分内容都会发生变化,那么先 TRUNCATE 再用函数结果 INSERT 可能是最快的方法。如果在同一个事务中完成,Postgres 不需要编写 WAL(因为无论如何我们都是从头开始的)。此外,您会得到一个没有膨胀的全新表,这会对这个过程的下一次迭代产生积极影响。对于大表,还可以先删除索引等,之后再重新创建。详细信息见此相关答案:


其余的仅适用于出于某种原因要保留现有行的情况。


例如,触发器确实造成了妨碍(如你所写;我对此表示怀疑,但暂且这样假设),或者表中还有其他不能丢失的列。

取决于更改集中有多少行(从函数返回)...

几行

少于约 1000 行(具体取决于很多因素)。数据修改 CTE(它会为函数结果自动提供一个廉价的内部临时表)可能是最快的:

-- Synchronise user_info with the result of calculate_users_info() in a single
-- statement using data-modifying CTEs:
--   x   : the function result, computed once and reused by every step
--   del : delete rows whose (user_id, info_id) key is no longer in x
--   upd : update rows whose info columns differ from x
--   main: insert rows of x whose key is not yet in user_info
-- All sub-statements of one WITH query run against the same snapshot, so the
-- final INSERT's NOT EXISTS check sees user_info as it was before del/upd ran.
WITH x AS (SELECT * FROM calculate_users_info())
, del AS (
   DELETE FROM user_info t
   WHERE  NOT EXISTS (
      SELECT 1 FROM x
      WHERE  x.user_id = t.user_id
      AND    x.info_id = t.info_id
      )
   )                                   -- closes "del" (was missing)
, upd AS (
    UPDATE user_info t 
    SET   (info1,   info2,   info3,   info4,   info5)
      = (x.info1, x.info2, x.info3, x.info4, x.info5)
    FROM   x
    WHERE  x.user_id = t.user_id
    AND    x.info_id = t.info_id
    -- IS DISTINCT FROM also detects changes between NULL and a value,
    -- which <> would silently skip.
    AND   (x.info1 IS DISTINCT FROM t.info1 OR 
           x.info2 IS DISTINCT FROM t.info2 OR
           x.info3 IS DISTINCT FROM t.info3 OR
           x.info4 IS DISTINCT FROM t.info4 OR
           x.info5 IS DISTINCT FROM t.info5)
    )
INSERT INTO user_info
      (user_id, info_id, info1, info2, info3, info4, info5) 
SELECT x.user_id, x.info_id, x.info1, x.info2, x.info3, x.info4, x.info5
FROM   x
WHERE  NOT EXISTS (
    SELECT 1 
    FROM   user_info t3
    WHERE  t3.user_id = x.user_id      -- was the undefined alias t2
    AND    t3.info_id = x.info_id
    )
;
Run Code Online (Sandbox Code Playgroud)

许多行

在这种情况下,在临时表上构建的脚本看起来基本没问题。主要优点是临时表上的索引 - 由于缺少统计信息

我还有其他一些建议:

-- Build the temporary table directly from the function's result set; it is
-- dropped automatically at the end of the transaction.
create temporary table user_info_tmp
on commit drop
as
select f.*
from calculate_users_info() as f;

-- Index the (user_id, info_id) key columns.
create index user_info_tmp_idx
    on user_info_tmp (user_id, info_id);

-- Important: collect planner statistics for the new table by hand.
analyze user_info_tmp;

-- Anti-join via NOT EXISTS: remove the user_info rows whose
-- (user_id, info_id) key is absent from user_info_tmp.
delete from user_info as target
where not exists (
    select 1
    from user_info_tmp as staged
    where staged.user_id = target.user_id
      and staged.info_id = target.info_id
);

-- Refresh statistics; only worthwhile if large parts have been removed.
analyze user_info;

-- Update the rows whose info columns differ from user_info_tmp, matched on
-- (user_id, info_id).
UPDATE user_info t         -- with short syntax
SET   (info1,   info2,   info3,   info4,   info5)
  = (x.info1, x.info2, x.info3, x.info4, x.info5)  -- shorter, not faster
FROM   user_info_tmp x 
WHERE  x.user_id = t.user_id
AND    x.info_id = t.info_id
-- Row-wise IS DISTINCT FROM is true when at least one column pair differs and
-- treats NULL as a comparable value, so changes between NULL and a value are
-- detected too (the former <> comparisons skipped them).
AND   (x.info1, x.info2, x.info3, x.info4, x.info5)
      IS DISTINCT FROM
      (t.info1, t.info2, t.info3, t.info4, t.info5);

-- Insert the rows of user_info_tmp that have no match in user_info.
-- The select list is qualified with x: info1..info5 exist in both joined
-- tables and are not part of the USING clause, so unqualified references to
-- them are ambiguous and the statement fails.
INSERT INTO user_info      -- with join syntax
      (user_id, info_id, info1, info2, info3, info4, info5) 
SELECT x.user_id, x.info_id, x.info1, x.info2, x.info3, x.info4, x.info5
FROM   user_info_tmp x
LEFT   JOIN user_info u USING (user_id, info_id)
WHERE  u.user_id IS NULL;  -- no matching row in user_info; shorter, maybe faster
Run Code Online (Sandbox Code Playgroud)

要点

  • 临时表不会被自动分析。此外,在同一事务中创建并立即使用的表,通常也不会给 autovacuum 启动的机会。详细信息:

  • 在 9.1 下仍然推荐常规的 VACUUM ANALYZE 吗?

    由于这两个原因,您需要手动对该表运行 ANALYZE,最好就在我放置它的那个位置。这可以避免严重误导的查询计划。次要的额外优化:您可以将无关列(本例中除 (user_id, info_id) 之外的所有列)的统计目标设置为 0。

  • 不要对临时表使用 GLOBAL。根据文档:

    GLOBAL 或者 LOCAL

    为保持兼容性而被忽略。这些关键字已不推荐使用;详情请参阅 CREATE TABLE。

  • 您可以根据函数结果自动创建临时表。代码更短,速度也更快。

  • 考虑更短的语法变体,它们主要是缩短代码并且不会对性能产生太大影响。

work_mem / temp_buffers

无论哪种方式,您都需要足够的 RAM 才能使其快速运行。这对小基数几乎无关紧要,但对大表至关重要。在配置还算像样的环境中,几千行不应触及任何内存限制。如果行数更多,请务必为 CTE 分配足够的 work_mem,或为临时表分配足够的 temp_buffers。相关回答: