Gio*_*ino 6 linux multithreading cpu-usage node.js
我正在试验 Node.js 工作线程(worker threads)的使用,遇到了奇怪的耗时问题。
main.js
是主线程,它会快速地连续生成一系列工作线程。worker.js
执行一次 CPU 密集型(CPU-bound)计算(生成素数);顺便说一句,
generatePrimes()
这个 JavaScript 函数只是 CPU 密集型计算的演示示例。在我的真实场景中,工作线程是一个绑定了 C++ 库的 Node.js 程序(进行语音识别,在 CPU 占用 100% 的情况下运行约半秒)。
我的笔记本电脑:Ubuntu 20.04.2 LTS 桌面环境,有 8 个逻辑核心(4 个物理核心,启用超线程):
$ inxi -C -M
Machine: Type: Laptop System: HP product: HP Laptop 17-by1xxx v: Type1ProductConfigId serial: <superuser/root required>
Mobo: HP model: 8531 v: 17.16 serial: <superuser/root required> UEFI: Insyde v: F.32 date: 12/14/2018
CPU: Topology: Quad Core model: Intel Core i7-8565U bits: 64 type: MT MCP L2 cache: 8192 KiB Speed: 700 MHz min/max: 400/4600 MHz Core speeds (MHz): 1: 700 2: 700 3: 700 4: 700 5: 700 6: 700 7: 700 8: 700
Run Code Online (Sandbox Code Playgroud)
$ echo "CPU threads: $(grep -c processor /proc/cpuinfo)"
CPU threads: 8
Run Code Online (Sandbox Code Playgroud)
我经历过,当独立运行(单线程)调用函数时,计算总共花费了约8 秒:worker.js
generatePrimes(2, 1e7)
问题
当我几乎并行地生成多个线程(例如 6 个线程,请参阅下面的代码)时,我预计总耗时仍然是大约 8 秒(可能有少量开销),与生成的线程数量无关(既然有足够的 CPU 核心,它们难道不是并行运行的吗?)。相反,我得到的总耗时远远超过预期的约 8 秒,总计约 20 多秒?!为什么?
下面是源代码,以及使用 time/pidstat 得到的一些耗时测量结果:
main.js
// main.js
const { Worker } = require('worker_threads')
/**
 * Start `./worker.js` in a new worker thread and wrap its lifecycle in a Promise.
 *
 * @param {*} workerData - Value handed to the worker through the `workerData` option.
 * @returns {Promise<*>} Resolves with the first message the worker posts; rejects
 *   with the worker's error, or with an Error when the worker exits with a
 *   non-zero exit code.
 */
function runThread(workerData) {
  const launch = (onResult, onFailure) => {
    const thread = new Worker('./worker.js', { workerData })

    // A zero exit code is a normal shutdown and is ignored here.
    const onExit = (exitCode) => {
      if (exitCode === 0) return
      onFailure(new Error(`Worker stopped with exit code ${exitCode}`))
    }

    thread.on('message', onResult)
    thread.on('error', onFailure)
    thread.on('exit', onExit)
  }

  return new Promise(launch)
}
/**
 * Command-line entry point: `node main <number_of_threads>`.
 *
 * Spawns the requested number of worker threads in quick succession, each
 * running the same computation, and prints the message each worker posts
 * back.
 *
 * Exits with status 1 and a usage message when the argument is missing or is
 * not a positive integer.
 */
async function main() {
  const numThreads = Number(process.argv[2])
  if (!Number.isInteger(numThreads) || numThreads < 1) {
    console.error(`usage: ${process.argv[1]} number_of_threads`)
    // A usage error is a failure, so report a non-zero status to the shell.
    process.exit(1)
  }

  const min = 2
  const max = 1e7

  //
  // Run multiple threads "in parallel":
  //
  // This simulates a rapid spawn of a given number of thread computations.
  // The main thread starts the same worker numThreads times.
  //
  // The result reported by each thread is just a short "done" string.
  //
  for (let i = 0; i < numThreads; i++) {
    setImmediate(async () => {
      try {
        console.log(await runThread({ min, max }))
      } catch (err) {
        // Report a failed worker instead of leaving the rejection unhandled;
        // the remaining workers keep running.
        console.error(err)
        process.exitCode = 1
      }
    })
  }
}
// Run main() only when this file is executed directly, not when it is required.
if (require.main === module) {
  main()
}

// Expose runThread to other modules.
module.exports = { runThread }
Run Code Online (Sandbox Code Playgroud)
worker.js
// worker.js
const { threadId, workerData, parentPort } = require('worker_threads')
const { generatePrimes } = require('./generatePrimes')
// Read the computation bounds passed in by the main/parent thread.
const { min: lowerBound, max: upperBound } = workerData

// Synchronous, long-running, CPU-bound computation.
const primes = generatePrimes(lowerBound, upperBound)

// Report completion to the main thread.
// Only a short string is sent back (not the primes array), so the measured
// elapsed time cannot be blamed on transferring a large amount of data.
const doneMessage = `Done. Thread id: ${threadId}`
parentPort.postMessage(doneMessage)
Run Code Online (Sandbox Code Playgroud)
generatePrimes.js
// generatePrimes.js
// long running / CPU-bound calculation
/**
 * Collect every prime number in the half-open interval [start, start + range).
 *
 * Uses plain trial division; it is intentionally simple and serves as a
 * CPU-bound workload.
 *
 * @param {number} start - First candidate to examine (expected to be an integer).
 * @param {number} range - How many consecutive integers to examine from `start`.
 * @returns {number[]} The primes found, in ascending order.
 */
function generatePrimes(start, range) {
  const primes = []
  const end = start + range

  // Nothing below 2 is prime, so never start lower than that.
  for (let candidate = Math.max(start, 2); candidate < end; candidate++) {
    let isPrime = true

    // Divisors always run from 2 up to sqrt(candidate), independent of `start`.
    // Starting the divisors at `start` gave wrong answers for any start other than 2.
    for (let divisor = 2; divisor * divisor <= candidate; divisor++) {
      if (candidate % divisor === 0) {
        isPrime = false
        break
      }
    }

    if (isPrime) {
      primes.push(candidate)
    }
  }

  return primes
}
/**
 * Standalone entry point: runs the prime computation once on the current
 * thread with the same arguments the worker threads use, and prints the
 * resulting array.
 */
function main() {
  const firstCandidate = 2
  const candidateCount = 1e7
  const result = generatePrimes(firstCandidate, candidateCount)
  console.log(result)
}
// Run main() only when this file is executed directly, not when it is required.
if (require.main === module) {
  main()
}

// Expose generatePrimes to other modules.
module.exports = { generatePrimes }
Run Code Online (Sandbox Code Playgroud)
测试
测试 1:没有工作线程
generatePrimes.js
独立运行 -> 耗时:约 8 秒
$ /usr/bin/time -f "%E" pidstat 1 -u -e node generatePrimes
Linux 5.8.0-50-generic (giorgio-HP-Laptop-17-by1xxx) 22/04/2021 _x86_64_ (8 CPU)
09:19:05 UID PID %usr %system %guest %wait %CPU CPU Command
09:19:06 1000 247776 98,02 0,00 0,00 0,00 98,02 5 node
09:19:07 1000 247776 100,00 0,00 0,00 0,00 100,00 5 node
09:19:08 1000 247776 100,00 0,00 0,00 0,00 100,00 5 node
09:19:09 1000 247776 100,00 0,00 0,00 0,00 100,00 5 node
09:19:10 1000 247776 100,00 0,00 0,00 0,00 100,00 5 node
09:19:11 1000 247776 100,00 0,00 0,00 0,00 100,00 5 node
09:19:12 1000 247776 100,00 0,00 0,00 0,00 100,00 5 node
09:19:13 1000 247776 100,00 0,00 0,00 0,00 100,00 5 node
[
2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37,
41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89,
97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151,
157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223,
227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281,
283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359,
367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433,
439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503,
509, 521, 523, 541,
... 664479 more items
]
Average: 1000 247776 99,75 0,00 0,00 0,00 99,75 - node
0:08.60
Run Code Online (Sandbox Code Playgroud)
测试 2:生成 1 个线程
main.js
生成 1 个线程 -> 耗时:约 8 秒(同样)
$ /usr/bin/time -f "%E" pidstat 1 -u -e node main 1
Linux 5.8.0-50-generic (giorgio-HP-Laptop-17-by1xxx) 22/04/2021 _x86_64_ (8 CPU)
your machine has 8 cores.
09:21:01 UID PID %usr %system %guest %wait %CPU CPU Command
09:21:02 1000 247867 95,00 2,00 0,00 0,00 97,00 3 node
09:21:03 1000 247867 100,00 0,00 0,00 0,00 100,00 3 node
09:21:04 1000 247867 100,00 0,00 0,00 0,00 100,00 3 node
09:21:05 1000 247867 100,00 0,00 0,00 0,00 100,00 3 node
09:21:06 1000 247867 100,00 0,00 0,00 0,00 100,00 3 node
09:21:07 1000 247867 100,00 0,00 0,00 0,00 100,00 3 node
09:21:08 1000 247867 100,00 0,00 0,00 0,00 100,00 3 node
09:21:09 1000 247867 100,00 1,00 0,00 0,00 101,00 3 node
Done. Thread id: 1
Average: 1000 247867 99,38 0,38 0,00 0,00 99,75 - node
0:08.50
Run Code Online (Sandbox Code Playgroud)
测试 3:生成 6 个线程
生成多个(6 个)线程 -> 耗时:约 21 秒
$ /usr/bin/time -f "%E" pidstat 1 -u -e node main 6
Linux 5.8.0-50-generic (giorgio-HP-Laptop-17-by1xxx) 22/04/2021 _x86_64_ (8 CPU)
your machine has 8 cores.
09:23:38 UID PID %usr %system %guest %wait %CPU CPU Command
09:23:39 1000 247946 554,00 1,00 0,00 0,00 555,00 0 node
09:23:40 1000 247946 599,00 1,00 0,00 0,00 600,00 0 node
09:23:41 1000 247946 600,00 1,00 0,00 0,00 601,00 0 node
09:23:42 1000 247946 599,00 0,00 0,00 0,00 599,00 0 node
09:23:43 1000 247946 599,00 1,00 0,00 0,00 600,00 0 node
09:23:44 1000 247946 599,00 0,00 0,00 0,00 599,00 0 node
09:23:45 1000 247946 600,00 0,00 0,00 0,00 600,00 0 node
09:23:46 1000 247946 599,00 2,00 0,00 0,00 601,00 0 node
09:23:47 1000 247946 599,00 0,00 0,00 0,00 599,00 0 node
09:23:48 1000 247946 599,00 0,00 0,00 0,00 599,00 0 node
09:23:49 1000 247946 600,00 1,00 0,00 0,00 601,00 0 node
09:23:50 1000 247946 598,00 1,00 0,00 0,00 599,00 0 node
09:23:51 1000 247946 599,00 2,00 0,00 0,00 601,00 0 node
Done. Thread id: 1
Done. Thread id: 4
09:23:52 1000 247946 430,00 0,00 0,00 0,00 430,00 0 node
09:23:53 1000 247946 398,00 0,00 0,00 0,00 398,00 0 node
09:23:54 1000 247946 399,00 1,00 0,00 0,00 400,00 0 node
09:23:55 1000 247946 398,00 0,00 0,00 0,00 398,00 0 node
09:23:56 1000 247946 399,00 0,00 0,00 0,00 399,00 0 node
09:23:57 1000 247946 396,00 3,00 0,00 0,00 399,00 0 node
09:23:58 1000 247946 399,00 0,00 0,00 0,00 399,00 0 node
Done. Thread id: 5
Done. Thread id: 6
09:23:59 1000 247946 399,00 1,00 0,00 0,00 400,00 7 node
Done. Thread id: 2
Done. Thread id: 3
Average: 1000 247946 522,00 0,71 0,00 0,00 522,71 - node
0:21.05
Run Code Online (Sandbox Code Playgroud)
为什么我得到了~20 秒而不是预期的~8 秒?我哪里错了?
更新
为了清楚起见,我将 CPU 绑定函数generatePrimes 分离到一个单独的模块中。
我添加了更多耗时测试,将线程数量从 1 增加到 9。测试表明,总耗时随着生成的线程数量的增加而增加。这对我来说毫无道理 :(
$ /usr/bin/time -f "%E" node main 1
your machine has 8 cores.
Done. Thread id: 1
0:08.86
$ /usr/bin/time -f "%E" node main 2
your machine has 8 cores.
Done. Thread id: 2
Done. Thread id: 1
0:13.96
$ /usr/bin/time -f "%E" node main 3
your machine has 8 cores.
Done. Thread id: 2
Done. Thread id: 1
Done. Thread id: 3
0:16.71
$ /usr/bin/time -f "%E" node main 4
your machine has 8 cores.
Done. Thread id: 3
Done. Thread id: 2
Done. Thread id: 4
Done. Thread id: 1
0:21.87
$ /usr/bin/time -f "%E" node main 5
your machine has 8 cores.
Done. Thread id: 3
Done. Thread id: 2
Done. Thread id: 5
Done. Thread id: 1
Done. Thread id: 4
0:22.20
$ /usr/bin/time -f "%E" node main 6
your machine has 8 cores.
Done. Thread id: 3
Done. Thread id: 4
Done. Thread id: 6
Done. Thread id: 2
Done. Thread id: 5
Done. Thread id: 1
0:23.74
$ /usr/bin/time -f "%E" node main 7
your machine has 8 cores.
Done. Thread id: 3
Done. Thread id: 4
Done. Thread id: 7
Done. Thread id: 2
Done. Thread id: 5
Done. Thread id: 1
Done. Thread id: 6
0:32.00
$ /usr/bin/time -f "%E" node main 8
your machine has 8 cores.
Done. Thread id: 6
Done. Thread id: 3
Done. Thread id: 2
Done. Thread id: 5
Done. Thread id: 1
Done. Thread id: 8
Done. Thread id: 7
Done. Thread id: 4
0:35.92
$ /usr/bin/time -f "%E" node main 9
your machine has 8 cores.
warning: number of requested threads (9) is higher than number of available cores (8)
Done. Thread id: 8
Done. Thread id: 4
Done. Thread id: 6
Done. Thread id: 9
Done. Thread id: 2
Done. Thread id: 3
Done. Thread id: 7
Done. Thread id: 5
Done. Thread id: 1
0:40.27
Run Code Online (Sandbox Code Playgroud)
顺便说一句,相关问题:为什么多次运行同一程序的程序执行时间不同?
您遇到的是被称为阿姆达尔定律(Amdahl's Law)的实际限制。如果处理器数量翻倍,您通常不会得到两倍的计算吞吐量。为什么呢?
有几个原因,通常很难梳理出来并单独衡量,包括:
而且,笔记本电脑不具备巨型多核服务器那样高功耗的 RAM 和总线结构,因此资源竞争更加激烈。它们更多是针对桌面使用场景设计的,即由各种用户界面进程共享核心和超线程。
如果每个真实核心有一个工作线程,则您的主 Nodejs 进程也必须与您的线程共享。正如您的 Xorg 服务器、您的文件系统以及 Focal Fossa 上所有几十个真正有用的守护进程一样。
如果这对于您的容量规划来说是一个关键问题,请花几十欧元/美元在云供应商之一租用一台大型 24 或 32 核服务器来运行您的实验。与您真实的信号处理工作负载一起。这是一个更有用的测试。如果您租用他们提供的最多核心,您可能会获得整台机器,而不会与其他客户共享。
不要浪费时间去研究笔记本电脑主板上那些为了省钱省电而采用的捷径和拙劣的硬件取巧手段。
(本人这个老前辈曾经在一家计算机公司从事现场软件支持工作。我不得不一遍又一遍地向销售人员解释阿姆达尔定律,好让他们不要过度推销该公司昂贵得离谱的新型并行处理产品。但他们仍然过度推销,直到一些大客户要求退款才让他们吸取了教训。)
归档时间: |
|
查看次数: |
3203 次 |
最近记录: |