Add proper command line interface for extraction

2019-07-22 20:58:24 +02:00 · 2019-07-22 20:58:24 +02:00 · 738aecf068
commit 738aecf068
parent 195ecdad63
5 changed files with 42 additions and 18 deletions
--- a/README.md
+++ b/README.md
@ -6,6 +6,10 @@ and [node.js](https://nodejs.org).

 ## Work in progress

+### Usage
+``git clone https://github.com/vane/pdf-gold-digger``  
+``node gd.js -f some.pdf``
+
 ### Supports:
 - extract text
  - separate each page
--- a/gd.js
+++ b/gd.js
@ -0,0 +1,24 @@
+const GoldDigger = require('./lib/GoldDigger');
+const minimist = require('minimist');
+
+const help = `
+--file -f pdf file location
+--debug -d show debug information
+`;
+
+/**
+ * Converts a parsed command line value to a boolean.
+ * Only the string 'true', the number 1 and the boolean true enable the flag;
+ * every other value (including undefined) is treated as false.
+ */
+const toBool = (val) => {
+  return val === 'true' || val === 1 || val === true;
+};
+
+// Keep the file option a string: numeric-looking file names must not be
+// converted to numbers, and a bare `-f` should yield '' instead of true.
+const argv = minimist(process.argv.slice(2), { string: ['file', 'f'] });
+const fpath = argv['file'] || argv['f'];
+const debug = toBool(argv['debug'] || argv['d']);
+if(!fpath) {
+  // No input file given: show usage and stop. A top-level return is valid
+  // here because Node wraps CommonJS modules in a function.
+  console.log(help);
+  if(debug) console.log(argv);
+  return;
+}
+if(debug) console.log(fpath);
+const gd = new GoldDigger();
+// dig() is async: report the failure and exit non-zero instead of leaving
+// an unhandled promise rejection.
+gd.dig(fpath, debug).catch((err) => {
+  console.error(debug || !(err && err.message) ? err : err.message);
+  process.exitCode = 1;
+});
--- a/lib/GoldDigger.js
+++ b/lib/GoldDigger.js
@ -14,23 +14,25 @@ class GoldDigger {
    this.txt = new ExtractText();
  }

-  async dig(fpath) {
+  async dig(fpath, debug) {
    if (!fs.existsSync(fpath)) {
-      throw GoldDiggerError(`File not exists ${fpath}`);
+      throw new GoldDiggerError(`File not exists ${fpath}`);
    }
-    console.log('Reading pdf');
+    if(debug) console.log('Reading pdf');
    // configuration
    const config = {};
    config.paintFormXObject = false;
    config.paintImageMaskXObject = false;
    config.paintJpegXObject = false;
-    const data =fs.readFileSync(fpath);
-    console.log(data.length);
+    // read file
+    const data = fs.readFileSync(fpath);
+    if(debug) console.log(data.length);
    const doc = await pdf.getDocument({
      data:data,
-    });
-    console.log(`Pages : ${doc.numPages}`);
-    for(let pageNum = 1;pageNum<=doc.numPages;pageNum++) {
+    }).promise;
+    if(debug) console.log(`Pages : ${doc.numPages}`);
+    // read pages
+    for(let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
      const page = await doc.getPage(pageNum);
      const viewport = page.getViewport({ scale: 1.0, });
      //const text = await page.extractTextContent();
@ -39,12 +41,9 @@ class GoldDigger {
      // load dependencies
      const dependencies = await this.loadDependencies(page, operatorList);
      const opTree = this.convertOpList(operatorList);
-      console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`);
+      if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`);
      const output = this.executeOpTree(opTree, page, dependencies, config);
-      console.log(`--- END Page ${pageNum} objects : ${output.length}`)
-      /*output.forEach(el => {
-        console.log(el.toString())
-      });*/
+      if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`)
    }
  }

--- a/main.js
+++ b/main.js
@ -1,4 +0,0 @@
-const GoldDigger = require('./lib/GoldDigger');
-
-const gd = new GoldDigger();
-gd.dig('../../github.com/pdf.js/web/compressed.tracemonkey-pldi-09.pdf');
--- a/package.json
+++ b/package.json
@ -3,10 +3,11 @@
  "version": "0.0.1",
  "description": "Pdf information extraction library based on pdf.js and node.js",
  "author": "Michal Szczepanski <michal@vane.pl>",
-  "main": "main.js",
+  "main": "gd.js",
  "license": "MIT",
  "repository": "szczepano/pdf-gold-digger",
  "dependencies": {
+    "minimist": "^1.2.0",
    "pdfjs-dist": "^2.1.266"
  }
 }