diff --git a/README.md b/README.md index aef0a9a..120f6f5 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,10 @@ and [node.js](https://nodejs.org). ## Work in progress +### Usage +``git clone https://github.com/vane/pdf-gold-digger`` +``gd -f some.pdf`` + ### Supports: - extract text - separate each page diff --git a/gd.js b/gd.js new file mode 100644 index 0000000..5de9e41 --- /dev/null +++ b/gd.js @@ -0,0 +1,24 @@ +const GoldDigger = require('./lib/GoldDigger'); +const minimist = require('minimist'); + +const help = ` +--file -f pdf file location +--debug -d show debug information +` +// converts argument to boolean +const toBool = (val) => { + return val === 'true' || val === 1 || val === true; +} + +const argv = minimist(process.argv.slice(2)) +const fpath = argv['file'] || argv['f']; +let debug = argv['debug'] || argv['d']; +debug = toBool(debug); +if(!fpath) { + console.log(help); + console.log(argv); + return; +} +if(debug) console.log(fpath); +const gd = new GoldDigger(); +gd.dig(fpath, debug) diff --git a/lib/GoldDigger.js b/lib/GoldDigger.js index e35fb94..7e1ee5a 100644 --- a/lib/GoldDigger.js +++ b/lib/GoldDigger.js @@ -14,23 +14,25 @@ class GoldDigger { this.txt = new ExtractText(); } - async dig(fpath) { + async dig(fpath, debug) { if (!fs.existsSync(fpath)) { - throw GoldDiggerError(`File not exists ${fpath}`); + throw new GoldDiggerError(`File not exists ${fpath}`); } - console.log('Reading pdf'); + if(debug) console.log('Reading pdf'); // configuration const config = {}; config.paintFormXObject = false; config.paintImageMaskXObject = false; config.paintJpegXObject = false; - const data =fs.readFileSync(fpath); - console.log(data.length); + // read file + const data = fs.readFileSync(fpath); + if(debug) console.log(data.length); const doc = await pdf.getDocument({ data:data, - }); - console.log(`Pages : ${doc.numPages}`); - for(let pageNum = 1;pageNum<=doc.numPages;pageNum++) { + }).promise; + if(debug) console.log(`Pages : ${doc.numPages}`); + // read pages + for(let pageNum = 1; pageNum <= doc.numPages; pageNum++) { const page = await doc.getPage(pageNum); const viewport = page.getViewport({ scale: 1.0, }); //const text = await page.extractTextContent(); @@ -39,12 +41,9 @@ class GoldDigger { // load dependencies const dependencies = await this.loadDependencies(page, operatorList); const opTree = this.convertOpList(operatorList); - console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`); + if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`); const output = this.executeOpTree(opTree, page, dependencies, config); - console.log(`--- END Page ${pageNum} objects : ${output.length}`) - /*output.forEach(el => { - console.log(el.toString()) - });*/ + if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`) } } diff --git a/main.js b/main.js deleted file mode 100644 index a3023b4..0000000 --- a/main.js +++ /dev/null @@ -1,4 +0,0 @@ -const GoldDigger = require('./lib/GoldDigger'); - -const gd = new GoldDigger(); -gd.dig('../../github.com/pdf.js/web/compressed.tracemonkey-pldi-09.pdf'); \ No newline at end of file diff --git a/package.json b/package.json index cf559bd..39f7c7c 100644 --- a/package.json +++ b/package.json @@ -3,10 +3,11 @@ "version": "0.0.1", "description": "Pdf information extraction library based on pdf.js and node.js", "author": "Michal Szczepanski ", - "main": "main.js", + "main": "gd.js", "license": "MIT", "repository": "szczepano/pdf-gold-digger", "dependencies": { + "minimist": "^1.2.0", "pdfjs-dist": "^2.1.266" } }