Add proper command line interface for extraction
This commit is contained in:
parent
195ecdad63
commit
738aecf068
@ -6,6 +6,10 @@ and [node.js](https://nodejs.org).
|
||||
|
||||
## Work in progress
|
||||
|
||||
### Usage
|
||||
``git clone https://github.com/vane/pdf-gold-digger``
|
||||
``node gd.js -f some.pdf``
|
||||
|
||||
### Supports:
|
||||
- extract text
|
||||
- separate each page
|
||||
|
24
gd.js
Normal file
24
gd.js
Normal file
@ -0,0 +1,24 @@
|
||||
const GoldDigger = require('./lib/GoldDigger');
|
||||
const minimist = require('minimist');
|
||||
|
||||
const help = `
|
||||
--file -f pdf file location
|
||||
--debug -d show debug information
|
||||
`
|
||||
/**
 * Converts a parsed command line argument value to a boolean.
 *
 * Only three values count as "on": the boolean `true`, the number `1`
 * and the exact string `'true'`. Every other value (including
 * `undefined`, `'1'` and differently-cased strings) yields `false`.
 *
 * @param {*} val - value to convert
 * @returns {boolean} `true` when `val` is exactly `true`, `1` or `'true'`
 */
const toBool = (val) => {
  return val === 'true' || val === 1 || val === true;
};
|
||||
|
||||
// Command line entry point: parse the arguments that follow `node gd.js`.
const argv = minimist(process.argv.slice(2));
const fpath = argv['file'] || argv['f'];
const debug = toBool(argv['debug'] || argv['d']);

if (!fpath) {
  // No input file given: print the usage text and the parsed arguments.
  console.log(help);
  console.log(argv);
} else {
  if (debug) console.log(fpath);
  const gd = new GoldDigger();
  // dig() is async, so a failure surfaces as a rejected promise. Report it
  // and set a failing exit code instead of leaving the rejection unhandled.
  gd.dig(fpath, debug).catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
}
|
@ -14,23 +14,25 @@ class GoldDigger {
|
||||
this.txt = new ExtractText();
|
||||
}
|
||||
|
||||
async dig(fpath) {
|
||||
async dig(fpath, debug) {
|
||||
if (!fs.existsSync(fpath)) {
|
||||
throw GoldDiggerError(`File not exists ${fpath}`);
|
||||
throw new GoldDiggerError(`File not exists ${fpath}`);
|
||||
}
|
||||
console.log('Reading pdf');
|
||||
if(debug) console.log('Reading pdf');
|
||||
// configuration
|
||||
const config = {};
|
||||
config.paintFormXObject = false;
|
||||
config.paintImageMaskXObject = false;
|
||||
config.paintJpegXObject = false;
|
||||
const data =fs.readFileSync(fpath);
|
||||
console.log(data.length);
|
||||
// read file
|
||||
const data = fs.readFileSync(fpath);
|
||||
if(debug) console.log(data.length);
|
||||
const doc = await pdf.getDocument({
|
||||
data:data,
|
||||
});
|
||||
console.log(`Pages : ${doc.numPages}`);
|
||||
for(let pageNum = 1;pageNum<=doc.numPages;pageNum++) {
|
||||
}).promise;
|
||||
if(debug) console.log(`Pages : ${doc.numPages}`);
|
||||
// read pages
|
||||
for(let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
|
||||
const page = await doc.getPage(pageNum);
|
||||
const viewport = page.getViewport({ scale: 1.0, });
|
||||
//const text = await page.extractTextContent();
|
||||
@ -39,12 +41,9 @@ class GoldDigger {
|
||||
// load dependencies
|
||||
const dependencies = await this.loadDependencies(page, operatorList);
|
||||
const opTree = this.convertOpList(operatorList);
|
||||
console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`);
|
||||
if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`);
|
||||
const output = this.executeOpTree(opTree, page, dependencies, config);
|
||||
console.log(`--- END Page ${pageNum} objects : ${output.length}`)
|
||||
/*output.forEach(el => {
|
||||
console.log(el.toString())
|
||||
});*/
|
||||
if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`)
|
||||
}
|
||||
}
|
||||
|
||||
|
4
main.js
4
main.js
@ -1,4 +0,0 @@
|
||||
const GoldDigger = require('./lib/GoldDigger');
|
||||
|
||||
const gd = new GoldDigger();
|
||||
gd.dig('../../github.com/pdf.js/web/compressed.tracemonkey-pldi-09.pdf');
|
@ -3,10 +3,11 @@
|
||||
"version": "0.0.1",
|
||||
"description": "Pdf information extraction library based on pdf.js and node.js",
|
||||
"author": "Michal Szczepanski <michal@vane.pl>",
|
||||
"main": "main.js",
|
||||
"main": "gd.js",
|
||||
"license": "MIT",
|
||||
"repository": "szczepano/pdf-gold-digger",
|
||||
"dependencies": {
|
||||
"minimist": "^1.2.0",
|
||||
"pdfjs-dist": "^2.1.266"
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user