From 2f4fa0c474301de3a77b236f3bdd317583c85d22 Mon Sep 17 00:00:00 2001 From: Michal Szczepanski Date: Mon, 22 Jul 2019 23:32:51 +0200 Subject: [PATCH] Rename Executor to Visitor, fix GoldDigger digPage --- lib/GoldDigger.js | 44 +++++++++++++++-------------- lib/pdf/{Executor.js => Visitor.js} | 15 ++++++++-- 2 files changed, 35 insertions(+), 24 deletions(-) rename lib/pdf/{Executor.js => Visitor.js} (94%) diff --git a/lib/GoldDigger.js b/lib/GoldDigger.js index f3f226c..16a7883 100644 --- a/lib/GoldDigger.js +++ b/lib/GoldDigger.js @@ -1,7 +1,7 @@ const fs = require('fs'); const pdf = require('pdfjs-dist'); const Extract = require('./pdf/Extract'); -const Executor = require('./pdf/Executor'); +const Visitor = require('./pdf/Visitor'); const Formatter = require('./pdf/Formatter'); class GoldDiggerError extends Error{ @@ -16,7 +16,7 @@ class GoldDigger { constructor(config) { this.config = config; - this.executor = new Executor(config); + this.visitor = new Visitor(config); this.formatter = new Formatter() } @@ -55,7 +55,13 @@ class GoldDigger { this.formatter.start(format, doc, metadata.info); // read pages for(let pageNum = 1; pageNum <= doc.numPages; pageNum++) { - this.digPage(doc, pageNum); + const page = await doc.getPage(pageNum); + const viewport = page.getViewport({ scale: 1.0, }); + if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`); + const output = await this.digPage(page, pageNum); + const last = pageNum == doc.numPages; + this.formatter.format(format, page, output, last); + if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`) } this.formatter.end(format); } @@ -65,20 +71,16 @@ class GoldDigger { * @param doc - pdf document * @param pageNum - page number */ - digPage(doc, pageNum) { - const page = await doc.getPage(pageNum); - const viewport = page.getViewport({ scale: 1.0, }); + async digPage(page, pageNum) { + //const text = await page.extractTextContent(); const operatorList = await 
page.getOperatorList(); // page.commonObjs, page.objs // load dependencies const dependencies = await this.loadDependencies(page, operatorList); const opTree = this.convertOpList(operatorList); - if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`); const output = this.executeOpTree(opTree, page, dependencies); - const last = pageNum == doc.numPages; - this.formatter.format(format, page, output, last); - if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`) + return output; } /** @@ -171,7 +173,7 @@ class GoldDigger { * @returns {Array} PDFObject array */ executeOpTree(opTree, page, dependencies) { - const debug = this.executor.debug; + const debug = this.visitor.debug; for (const opTreeElement of opTree) { const fn = opTreeElement.fn; const fnId = opTreeElement.fnId; @@ -179,7 +181,7 @@ class GoldDigger { switch (fnId | 0) { case pdf.OPS.beginText: - this.executor.beginText(args, page, dependencies); + this.visitor.beginText(args, page, dependencies); break; case pdf.OPS.dependency: // Handled in `loadDependencies`, so no warning should be shown. 
@@ -193,23 +195,23 @@ class GoldDigger { //this.setLeadingMoveText(args[0], args[1]); break; case pdf.OPS.setFont: - this.executor.setFont(args, page, dependencies); + this.visitor.setFont(args, page, dependencies); //this.setFont(args); break; case pdf.OPS.showText: - this.executor.showText(args, page, dependencies); + this.visitor.showText(args, page, dependencies); break; case pdf.OPS.showSpacedText: - this.executor.showSpacedText(args, page, dependencies); + this.visitor.showSpacedText(args, page, dependencies); break; case pdf.OPS.endText: - this.executor.endText(args, page, dependencies); + this.visitor.endText(args, page, dependencies); break; case pdf.OPS.moveText: - this.executor.moveText(args, page, dependencies); + this.visitor.moveText(args, page, dependencies); break; case pdf.OPS.setTextMatrix: - this.executor.setTextMatrix(args, page, dependencies); + this.visitor.setTextMatrix(args, page, dependencies); break; case pdf.OPS.setCharSpacing: //this.setCharSpacing(args[0]); @@ -334,11 +336,11 @@ class GoldDigger { //this.paintImageMaskXObject(args[0]); break; case pdf.OPS.paintFormXObjectBegin: - this.executor.paintFormXObjectBegin(args, page, dependencies); + this.visitor.paintFormXObjectBegin(args, page, dependencies); //this.paintFormXObjectBegin(args[0], args[1]); break; case pdf.OPS.paintFormXObjectEnd: - this.executor.paintFormXObjectEnd(args, page, dependencies); + this.visitor.paintFormXObjectEnd(args, page, dependencies); //this.paintFormXObjectEnd(); break; case pdf.OPS.closePath: @@ -384,7 +386,7 @@ class GoldDigger { break; } } - return this.executor.objectList; + return this.visitor.objectList; } } diff --git a/lib/pdf/Executor.js b/lib/pdf/Visitor.js similarity index 94% rename from lib/pdf/Executor.js rename to lib/pdf/Visitor.js index d31ca53..8cbd7ba 100644 --- a/lib/pdf/Executor.js +++ b/lib/pdf/Visitor.js @@ -1,7 +1,10 @@ const Extract = require('./Extract'); const Text = require('./Text'); -class Executor { +/** + * Visits pdf.OPS.* 
methods using pdf page data + */ +class Visitor { constructor (config, debug) { this.txt = new Extract.ExtractText(); @@ -12,7 +15,13 @@ class Executor { this.currentObject; this.currentFont; } - + + /** + * + * @param args + * @param page + * @param dependencies + */ beginText(args, page, dependencies) { if (this.debug) console.log('beginText'); if (this.skip) return; @@ -96,4 +105,4 @@ class Executor { } } -module.exports = Executor \ No newline at end of file +module.exports = Visitor \ No newline at end of file