Rename Executor to Visitor, fix GoldDigger digPage

This commit is contained in:
Michal Szczepanski 2019-07-22 23:32:51 +02:00
parent cbfdc29fde
commit 2f4fa0c474
2 changed files with 35 additions and 24 deletions

View File

@ -1,7 +1,7 @@
const fs = require('fs'); const fs = require('fs');
const pdf = require('pdfjs-dist'); const pdf = require('pdfjs-dist');
const Extract = require('./pdf/Extract'); const Extract = require('./pdf/Extract');
const Executor = require('./pdf/Executor'); const Visitor = require('./pdf/Visitor');
const Formatter = require('./pdf/Formatter'); const Formatter = require('./pdf/Formatter');
class GoldDiggerError extends Error{ class GoldDiggerError extends Error{
@ -16,7 +16,7 @@ class GoldDigger {
constructor(config) { constructor(config) {
this.config = config; this.config = config;
this.executor = new Executor(config); this.visitor = new Visitor(config);
this.formatter = new Formatter() this.formatter = new Formatter()
} }
@ -55,7 +55,13 @@ class GoldDigger {
this.formatter.start(format, doc, metadata.info); this.formatter.start(format, doc, metadata.info);
// read pages // read pages
for(let pageNum = 1; pageNum <= doc.numPages; pageNum++) { for(let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
this.digPage(doc, pageNum); const page = await doc.getPage(pageNum);
const viewport = page.getViewport({ scale: 1.0, });
if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`);
const output = await this.digPage(page, pageNum);
const last = pageNum == doc.numPages;
this.formatter.format(format, page, output, last);
if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`)
} }
this.formatter.end(format); this.formatter.end(format);
} }
@ -65,20 +71,16 @@ class GoldDigger {
* @param doc - pdf document * @param doc - pdf document
* @param pageNum - page number * @param pageNum - page number
*/ */
digPage(doc, pageNum) { async digPage(page, pageNum) {
const page = await doc.getPage(pageNum);
const viewport = page.getViewport({ scale: 1.0, });
//const text = await page.extractTextContent(); //const text = await page.extractTextContent();
const operatorList = await page.getOperatorList(); const operatorList = await page.getOperatorList();
// page.commonObjs, page.objs // page.commonObjs, page.objs
// load dependencies // load dependencies
const dependencies = await this.loadDependencies(page, operatorList); const dependencies = await this.loadDependencies(page, operatorList);
const opTree = this.convertOpList(operatorList); const opTree = this.convertOpList(operatorList);
if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`);
const output = this.executeOpTree(opTree, page, dependencies); const output = this.executeOpTree(opTree, page, dependencies);
const last = pageNum == doc.numPages; return output;
this.formatter.format(format, page, output, last);
if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`)
} }
/** /**
@ -171,7 +173,7 @@ class GoldDigger {
* @returns {Array} PDFObject array * @returns {Array} PDFObject array
*/ */
executeOpTree(opTree, page, dependencies) { executeOpTree(opTree, page, dependencies) {
const debug = this.executor.debug; const debug = this.visitor.debug;
for (const opTreeElement of opTree) { for (const opTreeElement of opTree) {
const fn = opTreeElement.fn; const fn = opTreeElement.fn;
const fnId = opTreeElement.fnId; const fnId = opTreeElement.fnId;
@ -179,7 +181,7 @@ class GoldDigger {
switch (fnId | 0) { switch (fnId | 0) {
case pdf.OPS.beginText: case pdf.OPS.beginText:
this.executor.beginText(args, page, dependencies); this.visitor.beginText(args, page, dependencies);
break; break;
case pdf.OPS.dependency: case pdf.OPS.dependency:
// Handled in `loadDependencies`, so no warning should be shown. // Handled in `loadDependencies`, so no warning should be shown.
@ -193,23 +195,23 @@ class GoldDigger {
//this.setLeadingMoveText(args[0], args[1]); //this.setLeadingMoveText(args[0], args[1]);
break; break;
case pdf.OPS.setFont: case pdf.OPS.setFont:
this.executor.setFont(args, page, dependencies); this.visitor.setFont(args, page, dependencies);
//this.setFont(args); //this.setFont(args);
break; break;
case pdf.OPS.showText: case pdf.OPS.showText:
this.executor.showText(args, page, dependencies); this.visitor.showText(args, page, dependencies);
break; break;
case pdf.OPS.showSpacedText: case pdf.OPS.showSpacedText:
this.executor.showSpacedText(args, page, dependencies); this.visitor.showSpacedText(args, page, dependencies);
break; break;
case pdf.OPS.endText: case pdf.OPS.endText:
this.executor.endText(args, page, dependencies); this.visitor.endText(args, page, dependencies);
break; break;
case pdf.OPS.moveText: case pdf.OPS.moveText:
this.executor.moveText(args, page, dependencies); this.visitor.moveText(args, page, dependencies);
break; break;
case pdf.OPS.setTextMatrix: case pdf.OPS.setTextMatrix:
this.executor.setTextMatrix(args, page, dependencies); this.visitor.setTextMatrix(args, page, dependencies);
break; break;
case pdf.OPS.setCharSpacing: case pdf.OPS.setCharSpacing:
//this.setCharSpacing(args[0]); //this.setCharSpacing(args[0]);
@ -334,11 +336,11 @@ class GoldDigger {
//this.paintImageMaskXObject(args[0]); //this.paintImageMaskXObject(args[0]);
break; break;
case pdf.OPS.paintFormXObjectBegin: case pdf.OPS.paintFormXObjectBegin:
this.executor.paintFormXObjectBegin(args, page, dependencies); this.visitor.paintFormXObjectBegin(args, page, dependencies);
//this.paintFormXObjectBegin(args[0], args[1]); //this.paintFormXObjectBegin(args[0], args[1]);
break; break;
case pdf.OPS.paintFormXObjectEnd: case pdf.OPS.paintFormXObjectEnd:
this.executor.paintFormXObjectEnd(args, page, dependencies); this.visitor.paintFormXObjectEnd(args, page, dependencies);
//this.paintFormXObjectEnd(); //this.paintFormXObjectEnd();
break; break;
case pdf.OPS.closePath: case pdf.OPS.closePath:
@ -384,7 +386,7 @@ class GoldDigger {
break; break;
} }
} }
return this.executor.objectList; return this.visitor.objectList;
} }
} }

View File

@ -1,7 +1,10 @@
const Extract = require('./Extract'); const Extract = require('./Extract');
const Text = require('./Text'); const Text = require('./Text');
class Executor { /**
* Visits pdf.OPS.* methods using pdf page data
*/
class Visitor {
constructor (config, debug) { constructor (config, debug) {
this.txt = new Extract.ExtractText(); this.txt = new Extract.ExtractText();
@ -13,6 +16,12 @@ class Executor {
this.currentFont; this.currentFont;
} }
/**
*
* @param args
* @param page
* @param dependencies
*/
beginText(args, page, dependencies) { beginText(args, page, dependencies) {
if (this.debug) console.log('beginText'); if (this.debug) console.log('beginText');
if (this.skip) return; if (this.skip) return;
@ -96,4 +105,4 @@ class Executor {
} }
} }
module.exports = Executor module.exports = Visitor