在Python中使用序列匹配器查找最长公共字符串

sya*_*yam 3 python sequence

我试图在 Python 中使用 difflib.SequenceMatcher 返回最长的公共字符串:

string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    
self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""

string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44 VMs [u'vm_353ca5', u'vm_e02d7f'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type 'exceptions.SystemExit'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    
self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""

match = SequenceMatcher(None, string1, string2).find_longest_match(0, len(string1), 0, len(string2))
print match
print(string1[match.a: match.a + match.size])

string1="""ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44,"""
string2="""ERROR agave_util.py:64 Timed out waiting for VMs [u'vm_353ca5', u'vm_e02d7f'] power on CRITICAL ha_test_util.py:44"""
match = SequenceMatcher(None, string1, string2).find_longest_match(0,    len(string1), 0, len(string2))
print(string1[match.a: match.a + match.size])
Run Code Online (Sandbox Code Playgroud)

所以基本上,在比较 string1 和 string2 [前两行] 时,它返回 `CRITICAL ha_test_util.py:44`;而当我从 string1 和 string2 [第 6 行和第 7 行] 中删去一些行之后,它返回 `ERROR agave_util.py:64 Timed out waiting for`。

基本上我的问题是为什么序列匹配器在我的第一个案例中没有返回正确的匹配项?

ran*_*mir 5

您遇到的是 SequenceMatcher 自动垃圾(autojunk)启发式的影响(在您的情况下是负面影响)。来自文档:

\n\n
\n

自动垃圾启发式:SequenceMatcher 支持一种启发式方法,可自动将某些序列项视为垃圾。该启发式会统计每个单独项目在序列中出现的次数。如果某个项目的重复项(第一个之后的)占序列的 1% 以上,并且序列长度至少为 200 个项目,则该项目被标记为“popular”(常见项),并在序列匹配时被视为垃圾。可以在创建 SequenceMatcher 时将 autojunk 参数设置为 False 来关闭此启发式。

\n
\n\n

在 SequenceMatcher 构造函数中,autojunk 默认为 True。如果您尝试使用 autojunk=False,您将获得预期的最长匹配:

\n\n
from difflib import SequenceMatcher\n\nstring1 = """ERROR agave_util.py:64 Timed out waiting for HA alert generated CRITICAL ha_test_util.py:44 HA alert generated, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in power_off_and_check_ha    self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type \'exceptions.SystemExit\'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 87, in test_stargate_master_power_off    self._host_power_off_test_cycle(host_of_stargate_master)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 469, in 
power_off_and_check_ha    self.wait_for_ha_alert(cutoff_usecs=latest_alert_start, **kwargs)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 418, in wait_for_ha_alert    interval=interval,  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.157. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""\nstring2 = """ERROR agave_util.py:64 Timed out waiting for VMs [u\'vm_353ca5\', u\'vm_e02d7f\'] power on CRITICAL ha_test_util.py:44 VMs [u\'vm_353ca5\', u\'vm_e02d7f\'] power on, Stack:File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 909, in <module>    main(FLAGS, sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 878, in main    worker.run(sync_state)  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 326, in run    if not self.__test_phase_wrapper(test_method):  File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, 
in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message) ERROR nutanix_test_runner_worker.py:595 Test failed: 1exc_type: <type \'exceptions.SystemExit\'>exc_value: 1stack:   File "/main/qa/py/qa/agave/nutanix_test_runner_worker.py", line 502, in __test_phase_wrapper    func()  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 67, in test_zoo_keeper_leader_power_off    self._host_power_off_test_cycle(leader_host)  File "/main/qa/test/agave/acropolis_tests/ha/best_effort_power_off_test.py", line 27, in _host_power_off_test_cycle    self.ha_util.power_off_and_check_ha(host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 468, in power_off_and_check_ha    self.verify_vms_not_on_host(host_vms, host)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 617, in verify_vms_not_on_host    self.wait_for_vms_power_on(vm_names, per_vm_timeout)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 599, in wait_for_vms_power_on    interval=15)  File "/main/.python/qa/util/agave_tools/ha_test_util.py", line 44, in wait_for_true    CHECK(result, message)  File "/main/.python/util/base/log.py", line 204, in CHECK    FATAL(log_msg, **kwargs)  File "/main/.python/util/base/log.py", line 185, in FATAL    sys.exit(1) ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:696 Failed to get gflags from 10.5.132.156. ERROR nutanix_test.py:1699 Failed to save cluster configuration"""\n\nmatch = SequenceMatcher(None, string1, string2, autojunk=False).find_longest_match(0, len(string1), 0, len(string2))\nprint(match)\n
Run Code Online (Sandbox Code Playgroud)\n\n

输出:

\n\n
Match(a=110, b=156, size=534)\n
Run Code Online (Sandbox Code Playgroud)\n\n

可以肯定的是,我们可以检查所有匹配的块并找到最长的:

\n\n
>>> max(SequenceMatcher(None, string1, string2, autojunk=False).get_matching_blocks(),\n...     key=lambda m: m.size)\nMatch(a=110, b=156, size=534)\n
Run Code Online (Sandbox Code Playgroud)\n\n
\n\n

为了用一个更简单的示例说明 autojunk 的影响,让我们看看这里发生了什么:

\n\n
>>> a = "aa:bb:cc" + ":"*200\n>>> b = "aa:bb" + ":"*200\n>>> SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))\nMatch(a=0, b=0, size=6)     # : is classified as junk\n>>> SequenceMatcher(None, a, b, autojunk=False).find_longest_match(0, len(a), 0, len(b))\nMatch(a=8, b=5, size=200)   # : is NOT classified as junk\n
Run Code Online (Sandbox Code Playgroud)\n\n

在第一种情况下(默认 autojunk=True),`:` 被视为垃圾字符(在长度至少为 200 项的序列中,它的重复项占比超过 1%),因此,“对人们来说看起来正确”的最长匹配只有 6 个字符(最开始的 6 个字符)。

\n\n

在第二种情况下(使用显式autojunk=False),垃圾启发法关闭,因此最长的匹配是最后 200 个字符。

\n\n

如果您对较短的序列(短于 200 个字符)重复相同的测试,您会发现autojunk没有什么区别,因为垃圾启发法已关闭(请参阅源代码)。

\n