Mer*_*y05 2 c++ graph boost-graph
我正在编写一个程序来处理图表。我在循环中定义了一个图,并向其中添加了顶点和边(大约 4k 个顶点和 40k 个边)。但是当一轮循环结束后,需要很长时间才能开始下一轮循环。
typedef adjacency_list <listS, vecS, bidirectionalS, property<vertex_name_t, LineLabel>> LineGraph
for (int i = 0; i < num_comp; i++) {
if (num_vertices(subgraphs[i]) == 1) {
continue;
}
cout << "Subgraph " << i + 1 << ":" << endl;
cout << "has " << num_vertices(subgraphs[i]) << " vertices" << endl;
LineGraph llineg;
get_line_graph(subgraphs[i], llineg);//vertices and edges are added here
}
Run Code Online (Sandbox Code Playgroud)
调试时,似乎当一个循环结束时,程序会跳转到一个名为scoped_ptr.hpp的文件,并运行
~scoped_ptr() BOOST_SP_NOEXCEPT
{
#if defined(BOOST_SP_ENABLE_DEBUG_HOOKS)
boost::sp_scalar_destructor_hook( px );
#endif
boost::checked_delete( px );
}
Run Code Online (Sandbox Code Playgroud)
然后它转到checked_delete.hpp并运行
template<class T> inline void checked_delete(T * x) BOOST_NOEXCEPT
{
// intentionally complex - simplification causes regressions
typedef char type_must_be_complete[ sizeof(T)? 1: -1 ];
(void) sizeof(type_must_be_complete);
delete x;
}
Run Code Online (Sandbox Code Playgroud)
我猜图中的边太多了,所以需要很长时间才能将它们从内存中删除。
我应该做什么才能让它更快?
我针对这个问题做了一个小测试。该图有 10617 个顶点和 72172 条边。
int main() {
typedef adjacency_list<setS, vecS, bidirectionalS> G;
timer t;
for (int j = 0; j < 2; j++) {
cout << t.elapsed() << endl;
int num_v = 10617;
int num_e = 72172;
G g;
for (int i = 0; i < num_v; i++) {
add_vertex(g);
}
ifstream infile("wordassociation-2011.txt"); //read edges from file
int a, b;
char c;
int i = 0;
for (int j = 0; j < num_a; ++j) {
infile >> a >> c >> b;
add_edge(a, b, g);
i++;
if (i % 5000 == 0) {
cout << "reading..." << endl;
}
}
cout << t.elapsed() << endl;
}
}
Run Code Online (Sandbox Code Playgroud)
结果显示,大约需要2分钟才能开始下一个循环。结果 0 的图片正在阅读...正在阅读...正在阅读...正在阅读...正在阅读...正在阅读...正在阅读...正在阅读...正在阅读...正在阅读...正在阅读...正在阅读。 ..正在阅读...正在阅读...0.981 122.192正在阅读...正在阅读...
所以,我不能独自离开:)
\n我从http://w3.usf.edu/FreeAssociation/AppendixA/index.html下载了数据集(显然这是您的来源,因为计数完全匹配)。
\n我编写了一个解析器,它读取Row数据类型:
// part of speech\nenum POS { Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Interjection, Conjunction, Other };\n\nusing Atom = AtomPool::Atom;\n\nstruct Association {\n Atom cue, target;\n bool normed_;\n unsigned _g; // norm_group\n unsigned _p; // partic_response\n double fsg; // forward_strength\n double bsg; // backward_strength\n double msg; // mediated_strength "2-step strength"\n double osg; // overlapping_strength\n unsigned _m; // mediators\n unsigned mmia; // mediators_missing_in_action\n double _o; // overlapping_associates shared by cue and target\n unsigned omia; // overlapping_missing_in_action\n};\n\nstruct WordAttributes {\n unsigned ss; // set size\n unsigned fr; // frequency\n double con; // concreteness\n bool h; // is homograph?\n POS ps; // part of speech\n double mc; // mean connectivity among its associates\n double pr; // probablity of a resonant connection\n double rsg; // resonant strength\n unsigned uc; // use code\n};\n\nstruct Row {\n Association assoc;\n WordAttributes qattr, tattr;\n};\nRun Code Online (Sandbox Code Playgroud)\n我通过使用字符串原子进行了一些优化,因为字符串可能会被大量重用。我是对的(请参阅下面的演示打印的统计数据)。
\n// #define BOOST_SPIRIT_X3_DEBUG\n#include <boost/container/flat_set.hpp>\n#include <boost/iostreams/device/mapped_file.hpp>\n#include <boost/spirit/home/x3.hpp>\n\n#include <iomanip>\n#include <iostream>\n\n#include <chrono>\n#include <iomanip>\n#include <iostream>\n#include <utility>\n\nnamespace {\n using boost::container::flat_set;\n\n static long elapsed() {\n auto now = std::chrono::high_resolution_clock::now;\n using namespace std::chrono_literals;\n static auto start = now();\n auto t = now();\n\n return (t - std::exchange(start, t)) / 1ms;\n }\n\n void trace(auto const&... args) {\n ((std::cout << std::setw(10) << elapsed() << "ms ") << ... << args) << std::endl;\n }\n} // namespace\n\nstruct AtomPool {\n using Atom = std::uint32_t; // Backing::size_type;\n static constexpr Atom max() { return std::numeric_limits<Atom>::max(); }\n\n size_t size() const { return _index.size(); }\n size_t size_bytes() const { return _backing.size(); }\n\n AtomPool() {\n _backing.reserve(100\'000);\n _index.reserve(12\'000);\n }\n\n ~AtomPool() {\n _index.clear();\n _backing.clear();\n if (_insertions)\n trace("~AtomPool deduplicated ", (_dedup * 100.0 / _insertions), "% of ", _insertions,\n " insertions");\n }\n\n private:\n using Backing = std::vector<char>;\n\n struct Cmp {\n AtomPool const& _pool;\n using is_transparent = void;\n\n template <typename... T> bool operator()(T const&... v) const { return (view(v) < ...); }\n\n private:\n std::string_view view(Atom id) const { return _pool.lookup(id); }\n std::string_view view(std::string_view sv) const { return sv; }\n };\n\n Backing _backing;\n flat_set<Atom, Cmp> _index{Cmp{*this}};\n size_t _insertions = 0, _dedup = 0;\n\n public:\n Atom atomize(std::string const& text) {\n if (_backing.size() > max())\n throw std::range_error("AtomPool");\n\n _insertions += 1;\n\n if (auto it = _index.find(text); it != _index.end()) {\n _dedup += 1;\n return *it;\n }\n\n auto pos = _backing.size();\n _backing.insert(_backing.end(), begin(text), end(text));\n _backing.push_back(\'\\0\');\n\n _index.insert(pos);\n return pos;\n }\n\n std::string_view lookup(Atom id) const {\n assert(id < _backing.size());\n return _backing.data() + id;\n }\n};\n\nnamespace DataSet {\n\n /*\n * |--------------+-----------------------------------------------------|\n * | Abbreviation | Equivalence |\n * |==============+=====================================================|\n * | CUE | Normed Word |\n * | TARGET | Response to Normed Word |\n * | NORMED? | Is Response Normed? |\n * | #G | Group size |\n * | # | P Number of Participants Producing Response |\n * | FSG | Forward Cue-to-Target Strength |\n * | BSG | Backward Target-to-Cue Strength |\n * | MSG | Mediated Strength |\n * | OSG | Overlapping Associate Strength |\n * | # | M Number of Mediators |\n * | MMIA | Number of Non-Normed Potential Mediating Associates | \n * | # | O Number of Overlaping Associates |\n * | OMIA | Number of Non-Normed Overlapping Associates |\n * | QSS | Cue: Set Size |\n * | QFR | Cue: Frequency |\n * | QCON | Cue: Concreteness |\n * | QH | Cue is a Homograph? |\n * | QPS | Cue: Part of Speech |\n * | QMC | Cue: Mean Connectivity Among Its Associates |\n * | QPR | Cue: Probability of a Resonant Connection |\n * | QRSG | Cue: Resonant Strength |\n * | QUC | Cue: Use Code |\n * | TSS | Target: Set Size |\n * | TFR | Target: Frequency |\n * | TCON | Target: Concreteness |\n * | TH | Target is a Homograph? |\n * | TPS | Target: Part of Speech |\n * | TMC | Target: Mean Connectivity Among Its Assoc. |\n * | TPR | Target: Probability of a Resonant Connection |\n * | TRSG | Target: Resonant Strength |\n * | TUC | Target: Use Code |\n * |--------------+-----------------------------------------------------|\n */\n enum ColId {\n CUE, TARGET, NORMED_, _G, _P, FSG, BSG, MSG, OSG, _M, MMIAS, _O, OMIAS, QSS, //\n QFR, QCON, QH, QPS, QMC, QPR, QRSG, QUC, TSS, TFR, TCON, TH, TPS, TMC, TPR, //\n TRSG, TUC, //\n NUMCOLUMNS\n };\n\n // part of speech\n enum POS { Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Interjection, Conjunction, Other };\n\n using Atom = AtomPool::Atom;\n\n struct Association {\n Atom cue, target;\n bool normed_;\n unsigned _g; // norm_group\n unsigned _p; // partic_response\n double fsg; // forward_strength\n double bsg; // backward_strength\n double msg; // mediated_strength "2-step strength"\n double osg; // overlapping_strength\n unsigned _m; // mediators\n unsigned mmia; // mediators_missing_in_action\n double _o; // overlapping_associates shared by cue and target\n unsigned omia; // overlapping_missing_in_action\n };\n\n struct WordAttributes {\n unsigned ss; // set size\n unsigned fr; // frequency\n double con; // concreteness\n bool h; // is homograph?\n POS ps; // part of speech\n double mc; // mean connectivity among its associates\n double pr; // probablity of a resonant connection\n double rsg; // resonant strength\n unsigned uc; // use code\n };\n\n struct Row {\n Association assoc;\n WordAttributes qattr, tattr;\n };\n} // namespace DataSet\n //\n#include <boost/fusion/adapted.hpp>\n\nBOOST_FUSION_ADAPT_STRUCT(DataSet::WordAttributes, ss, fr, con, h, ps, mc, pr, rsg, uc)\nBOOST_FUSION_ADAPT_STRUCT(DataSet::Association, cue, target, normed_, _g, _p, fsg, bsg, msg, osg, _m, mmia, _o, omia)\nBOOST_FUSION_ADAPT_STRUCT(DataSet::Row, assoc, qattr, tattr)\n\nstruct Reader {\n using Atom = DataSet::Atom;\n using Association = DataSet::Association;\n using Attributes = DataSet::WordAttributes;\n using POS = DataSet::POS;\n\n using Row = DataSet::Row;\n using File = std::vector<Row>;\n\n Reader(std::string const& fname, AtomPool& pool) : _atoms(pool), _mfs{fname} { parse(); }\n\n private:\n AtomPool& _atoms;\n boost::iostreams::mapped_file_source _mfs;\n File _rows;\n\n using It = char const*;\n\n void parse() try {\n It const f = _mfs.begin(), l = _mfs.end();\n\n long const total_bytes = l - f;\n _rows.reserve(total_bytes / 150);\n namespace x3 = boost::spirit::x3;\n namespace enc = x3::standard; // iso8859_1;\n using enc::blank;\n using enc::char_;\n using enc::lit;\n using enc::space;\n // using enc::symbols;\n\n auto atom = [this](auto& ctx) {\n auto& vec = _attr(ctx);\n _val(ctx) = _atoms.atomize(std::string(vec.begin(), vec.end()));\n };\n\n auto normed = x3::symbols<bool>{}.add("NO", false)("YES", true).sym;\n normed.name("YES|NO");\n\n auto col = x3::rule<Atom, Atom>{"col"} = x3::lexeme[*~char_(",\\r\\n")][atom];\n\n auto header = x3::eps //\n >> "CUE" >> \',\' //\n >> "TARGET" >> \',\' //\n >> "NORMED?" >> \',\' //\n >> "#G" >> \',\' //\n >> "#P" >> \',\' //\n >> "FSG" >> \',\' //\n >> "BSG" >> \',\' //\n >> "MSG" >> \',\' //\n >> "OSG" >> \',\' //\n >> "#M" >> \',\' //\n >> "MMIAS" >> \',\' //\n >> "#O" >> \',\' //\n >> "OMIAS" >> \',\' //\n >> "QSS" >> \',\' //\n >> "QFR" >> \',\' //\n >> "QCON" >> \',\' //\n >> "QH" >> \',\' //\n >> "QPS" >> \',\' //\n >> "QMC" >> \',\' //\n >> "QPR" >> \',\' //\n >> "QRSG" >> \',\' //\n >> "QUC" >> \',\' //\n >> "TSS" >> \',\' //\n >> "TFR" >> \',\' //\n >> "TCON" >> \',\' //\n >> "TH" >> \',\' //\n >> "TPS" >> \',\' //\n >> "TMC" >> \',\' //\n >> "TPR" >> \',\' //\n >> "TRSG" >> \',\' //\n >> "TUC" >> x3::eol;\n\n auto yen = lit(\'\\xa5\') | x3::iso8859_1::lit("\xc2\xa5");\n\n auto eoc = &(\',\' | x3::eol | x3::eoi); // end-of-column predicate\n\n auto strength //\n = x3::double_ >> eoc //\n | -yen >> x3::attr(NAN);\n\n auto count //\n = x3::uint_ >> &(\',\' | x3::eol | x3::eoi) //\n | -yen >> x3::attr(-1u);\n\n auto homograph = x3::matches[enc::graph - \',\'];\n auto part_of_speech = //\n x3::symbols<POS>{}\n .add("N", POS::Noun) //\n ("V", POS::Verb) //\n ("AJ", POS::Adjective) //\n ("AD", POS::Adverb) //\n ("P", POS::Pronoun) //\n ("PP", POS::Preposition) //\n ("I", POS::Interjection) //\n ("C", POS::Conjunction) //\n // these were undocumented but deduced from context\n ("INT", POS::Interjection) //\n ("A", POS::Adverb) //\n ("AV", POS::Adverb) //\n ("ADV", POS::Adverb) //\n ("ADJ", POS::Adjective) //\n ("PRP", POS::Pronoun) // "ACCOUNT" probably mistagged in dataset\n .sym |\n eoc >> x3::attr(POS::Other);\n\n auto attributes = x3::rule<Attributes, Attributes>{"attributes"} = x3::eps\n\n > count > \',\' // SS\n > count > \',\' // FR\n > strength > \',\' // CON\n > homograph > \',\' // H\n > part_of_speech > \',\' // PS\n > strength > \',\' // M\n > strength > \',\' // PR\n > strength > \',\' // RSG\n > count; // UC\n\n auto association = x3::rule<Association, Association>{"association"} = (*header >> !x3::eoi)\n\n > col > \',\' // CUE\n > col > \',\' // TARGET\n > normed > \',\' // NORMED?\n > count > \',\' // #G\n > count > \',\' // #P\n > strength > \',\' // FSG\n > strength > \',\' // BSG\n > strength > \',\' // MSG\n > strength > \',\' // OSG\n > count > \',\' // #M\n > count > \',\' // MMIAS\n > strength > \',\' // #O\n > count; // OMIAS\n\n auto row = x3::rule<Row, Row>{"row"} = //\n (*header >> !x3::eoi) //\n > association > \',\' // assoc\n > attributes > \',\' // qattr\n > attributes; // tattr\n\n auto file = x3::rule<File, File>{"file"} = *(row >> x3::eol);\n\n phrase_parse(f, l, x3::eps > file > x3::eoi, blank, _rows);\n\n using DataSet::NUMCOLUMNS;\n trace("Parsed: ", total_bytes / 1024.0 / 1024, "MiB ", "into ", _rows.size(), " rows");\n\n trace(_atoms.size(), " atoms totaling ", _atoms.size_bytes(), " bytes of backing storage");\n } catch (boost::spirit::x3::expectation_failure<It> const& ef) {\n It b = _mfs.begin(), e = _mfs.end();\n auto w = ef.where();\n auto sol = b + std::string_view(b, w).find_last_of("\\r\\n") + 1;\n\n auto line = std::string_view(sol, e);\n line = line.substr(0, line.find_first_of("\\r\\n"));\n\n auto l = 1 + std::count(b, w, \'\\n\'), c = 1 + (w - sol);\n trace("Expected ", ef.which(), " at L:", l, ":", c, " at\\n", //\n " ", line, "\\n", //\n std::setw(c + 4), \'^\', "----- here"); //\n std::exit(1);\n }\n};\n\nint main(int argc, char** argv) {\n AtomPool pool;\n\n trace("Start");\n {\n for (std::string fname : std::vector(argv+1, argv+argc))\n Reader r{fname, pool};\n }\n trace("Done");\n}\nRun Code Online (Sandbox Code Playgroud)\n对于小型演示数据集,将打印:
\n./a.out /Archive2/bd/de4002f5d4d3ee/main.cpp\n 0ms Start\n 54ms Parsed: 0.00769901MiB into 40 rows\n 0ms 70 atoms totaling 462 bytes of backing storage\n 0ms Done\n 0ms ~AtomPool deduplicated 12.5% of 80 insertions\nRun Code Online (Sandbox Code Playgroud)\n在本地,整个数据集的解析时间约为 90 毫秒:
\n/build/sotest dataset/Cue_Target_Pairs.* dataset/full.txt \n\n 0ms Start\n 14ms Parsed: 1.15371MiB into 8476 rows\n 0ms 3732 atoms totaling 26643 bytes of backing storage\n 13ms Parsed: 1.03647MiB into 7590 rows\n 0ms 5340 atoms totaling 39238 bytes of backing storage\n 19ms Parsed: 1.42218MiB into 10500 rows\n 0ms 6901 atoms totaling 51720 bytes of backing storage\n 15ms Parsed: 1.14505MiB into 8440 rows\n 0ms 7917 atoms totaling 59821 bytes of backing storage\n 14ms Parsed: 1.26198MiB into 9287 rows\n 0ms 8709 atoms totaling 66346 bytes of backing storage\n 15ms Parsed: 1.35513MiB into 9888 rows\n 0ms 9456 atoms totaling 72769 bytes of backing storage\n 14ms Parsed: 1.25216MiB into 9214 rows\n 0ms 10055 atoms totaling 77640 bytes of backing storage\n 15ms Parsed: 1.19052MiB into 8781 rows\n 0ms 10617 atoms totaling 82271 bytes of backing storage\n 86ms Parsed: 9.8172MiB into 72176 rows\n 0ms 10617 atoms totaling 82271 bytes of backing storage\n 0ms Done\n 0ms ~AtomPool deduplicated 96.3225% of 288704 insertions\nRun Code Online (Sandbox Code Playgroud)\n让我们创建所有(规范和非规范)单词作为顶点,并将所有关联(输入行)添加为边。
\n#include <boost/graph/adjacency_list.hpp>\nusing VertexProp = DataSet::WordAttributes;\nusing EdgeProp = DataSet::Association;\nusing Graph = boost::adjacency_list<boost::vecS, boost::vecS, boost::directedS, VertexProp, EdgeProp>;\nRun Code Online (Sandbox Code Playgroud)\n使用从原子到顶点描述符的临时映射,实现buildGraph可以非常简单:
Graph buildGraph() const {\n using V = Graph::vertex_descriptor;\n\n Graph g;\n g.m_vertices.reserve(_atoms.size());\n\n std::map<Atom, V> vmap; // not flat-map for stability\n\n for (auto& [assoc, qattr, tattr] : _rows) {\n auto s = vmap.find(assoc.cue);\n auto t = vmap.find(assoc.target);\n if (s == vmap.end()) s = vmap.emplace(assoc.cue, add_vertex(qattr, g)).first;\n if (t == vmap.end()) t = vmap.emplace(assoc.target, add_vertex(tattr, g)).first;\n\n assert(g[s->second] == qattr); // check consistent input for qattr/tattr\n assert(g[t->second] == tattr);\n\n add_edge(s->second, t->second, assoc, g);\n }\n\n return g;\n}\nRun Code Online (Sandbox Code Playgroud)\n在我们的main解析后构建图表:
int main(int argc, char** argv) {\n AtomPool pool;\n\n trace("Start");\n for (std::string fname : std::vector(argv + 1, argv + argc)) {\n Reader r{fname, pool};\n trace("Parsed ", fname);\n {\n Graph g = r.buildGraph();\n trace("Graph creation done; ", num_vertices(g), " vertices and ", num_edges(g), " edges");\n }\n trace("Graph destruction done");\n }\n trace("Done");\n}\nRun Code Online (Sandbox Code Playgroud)\n\n./a.out /Archive2/bd/de4002f5d4d3ee/main.cpp\n 0ms Start\n 322ms Parsed: 0.00769901MiB into 40 rows\n 0ms Parsed /Archive2/bd/de4002f5d4d3ee/main.cpp\n 0ms Graph creation done; 70 vertices and 40 edges\n 0ms Graph destruction done\n 0ms Done\n 0ms ~AtomPool deduplicated 12.5% of 80 insertions\nRun Code Online (Sandbox Code Playgroud)\n对于完整数据集,输出变为:
\n 0ms Start\n 93ms Parsed: 9.8172MiB into 72176 rows\n 0ms Parsed dataset/ful