我想从 docx 文件中提取文本,我尝试过使用 mammoth
var mammoth = require("mammoth");
// Extract the raw text of ./doc.docx, print it in full, then print it again
// one character per console line.
mammoth.extractRawText({path: "./doc.docx"})
  .then((result) => {
    const rawText = result.value;

    // Whole document text in a single log call.
    console.log(rawText);

    // Walks the string index by index, so each log call emits one character.
    for (let idx = 0; idx < rawText.length; idx += 1) {
      console.log(rawText.charAt(idx));
    }

    // Read from the result but not used any further in this snippet.
    const messages = result.messages;
  })
  .done();
Run Code Online (Sandbox Code Playgroud)
但问题在于，这个 for 循环是逐字符输出数据的，而我想要的是逐行获取数据。请帮帮我，或者您知道还有其他方法吗？
一种方法是获取整个文本，然后按 '\n' 分割：
import superagent from 'superagent';
import mammoth from 'mammoth';
// Location of the sample .docx document fetched by main().
const url = 'http://www.ojk.ee/sites/default/files/respondus-docx-sample-file_0.docx';

/**
 * Fetches the document at `url`, extracts its raw text with mammoth,
 * splits that text on '\n' and logs the resulting array.
 *
 * @returns {Promise<void>} Settles once the lines have been logged.
 */
const main = async () => {
  // NOTE(review): superagent.parse.image is presumably used here to keep the
  // response body as raw bytes — confirm against the superagent docs.
  const { body: buffer } = await superagent
    .get(url)
    .parse(superagent.parse.image)
    .buffer();

  const { value: rawText } = await mammoth.extractRawText({ buffer });

  console.log(rawText.split('\n'));
};

// Any rejection from main() is reported on stderr.
main().catch((error) => {
  console.error(error);
});
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
14621 次 |
| 最近记录: |