Add proper command line interface for extraction

This commit is contained in:
Michal Szczepanski 2019-07-22 20:58:24 +02:00
parent 195ecdad63
commit 738aecf068
5 changed files with 42 additions and 18 deletions

@ -6,6 +6,10 @@ and [node.js](https://nodejs.org).
## Work in progress
### Usage
``git clone https://github.com/vane/pdf-gold-digger``
``node gd.js -f some.pdf``
### Supports:
- extract text
- separate each page

24
gd.js Normal file

@ -0,0 +1,24 @@
const GoldDigger = require('./lib/GoldDigger');
const minimist = require('minimist');
// Usage text listing the supported command line options;
// printed when no input file is supplied.
const help = `
--file -f pdf file location
--debug -d show debug information
`
// Coerces a parsed command line value into a strict boolean.
// Only the string 'true', the number 1 and the boolean true count as "on";
// every other value (including undefined) yields false.
const toBool = (val) => {
  switch (val) {
    case true:
    case 1:
    case 'true':
      return true;
    default:
      return false;
  }
};
// Command line entry point: parse the arguments, then run the extraction.
// process.argv.slice(2) drops the node binary and the script path.
const argv = minimist(process.argv.slice(2));
// Both the long and the short spelling of each option are accepted.
const fpath = argv['file'] || argv['f'];
const debug = toBool(argv['debug'] || argv['d']);
if (!fpath) {
  // No input file given: print the usage text and the parsed arguments, then stop.
  console.log(help);
  console.log(argv);
} else {
  if (debug) console.log(fpath);
  const gd = new GoldDigger();
  // dig() is async and rejects (e.g. when the file does not exist); handle the
  // rejection here so the failure is reported and the process exits non-zero
  // instead of leaving an unhandled promise rejection.
  gd.dig(fpath, debug).catch((err) => {
    console.error(err);
    process.exitCode = 1;
  });
}

@ -14,23 +14,25 @@ class GoldDigger {
this.txt = new ExtractText();
}
async dig(fpath) {
async dig(fpath, debug) {
if (!fs.existsSync(fpath)) {
throw GoldDiggerError(`File not exists ${fpath}`);
throw new GoldDiggerError(`File not exists ${fpath}`);
}
console.log('Reading pdf');
if(debug) console.log('Reading pdf');
// configuration
const config = {};
config.paintFormXObject = false;
config.paintImageMaskXObject = false;
config.paintJpegXObject = false;
const data =fs.readFileSync(fpath);
console.log(data.length);
// read file
const data = fs.readFileSync(fpath);
if(debug) console.log(data.length);
const doc = await pdf.getDocument({
data:data,
});
console.log(`Pages : ${doc.numPages}`);
for(let pageNum = 1;pageNum<=doc.numPages;pageNum++) {
}).promise;
if(debug) console.log(`Pages : ${doc.numPages}`);
// read pages
for(let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
const page = await doc.getPage(pageNum);
const viewport = page.getViewport({ scale: 1.0, });
//const text = await page.extractTextContent();
@ -39,12 +41,9 @@ class GoldDigger {
// load dependencies
const dependencies = await this.loadDependencies(page, operatorList);
const opTree = this.convertOpList(operatorList);
console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`);
if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`);
const output = this.executeOpTree(opTree, page, dependencies, config);
console.log(`--- END Page ${pageNum} objects : ${output.length}`)
/*output.forEach(el => {
console.log(el.toString())
});*/
if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`)
}
}

@ -1,4 +0,0 @@
const GoldDigger = require('./lib/GoldDigger');
const gd = new GoldDigger();
gd.dig('../../github.com/pdf.js/web/compressed.tracemonkey-pldi-09.pdf');

@ -3,10 +3,11 @@
"version": "0.0.1",
"description": "Pdf information extraction library based on pdf.js and node.js",
"author": "Michal Szczepanski <michal@vane.pl>",
"main": "main.js",
"main": "gd.js",
"license": "MIT",
"repository": "szczepano/pdf-gold-digger",
"dependencies": {
"minimist": "^1.2.0",
"pdfjs-dist": "^2.1.266"
}
}