From b3d9b10317a7411c1882a9484c79c92150a2a711 Mon Sep 17 00:00:00 2001 From: Michal Szczepanski Date: Mon, 22 Jul 2019 22:46:05 +0200 Subject: [PATCH] Add output formatter and json output --- README.md | 6 ++- gd.js | 18 +++++++- lib/GoldDigger.js | 10 +++++ lib/pdf/Constraints.js | 5 +++ lib/pdf/Executor.js | 2 +- lib/pdf/Extract.js | 4 +- lib/pdf/Formatter.js | 100 +++++++++++++++++++++++++++++++++++++++++ lib/pdf/Text.js | 14 +++--- 8 files changed, 148 insertions(+), 11 deletions(-) create mode 100644 lib/pdf/Constraints.js create mode 100644 lib/pdf/Formatter.js diff --git a/README.md b/README.md index 120f6f5..a0066cb 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ and [node.js](https://nodejs.org). ### Usage ``git clone https://github.com/vane/pdf-gold-digger`` -``gd -f some.pdf`` +``node gd.js -f some_file.pdf`` ### Supports: - extract text @@ -16,11 +16,13 @@ and [node.js](https://nodejs.org). - separate each line - separate font information - bounding box position +- output to text ``-o text (default)`` +- output to json ``-o json`` ### TODO: - specify output format and output directory - output to xml format -- output to json format +- ~~output to json format~~ - extract images to files - extract font - extract tables diff --git a/gd.js b/gd.js index 022d6ce..0bea2bf 100644 --- a/gd.js +++ b/gd.js @@ -2,9 +2,17 @@ const GoldDigger = require('./lib/GoldDigger'); const minimist = require('minimist'); const help = ` ---file -f pdf file location ---debug -d show debug information +--file -f pdf file location (required) +--debug -d show debug information (optional - default false) +--output -o output format (optional - default text) ` + +const supportedOutput = ['text', 'xml']; +const ERR_INVALID_OUTPUT = ` +Invalid output +Please specify one of those values : "${supportedOutput}" +` + // converts argument to boolean const toBool = (val) => { return val === 'true' || val === 1 || val === true; @@ -13,7 +21,12 @@ const toBool = (val) => { const argv = minimist(process.argv.slice(2)) const fpath = argv['file'] || argv['f']; let debug = argv['debug'] || argv['d']; +let output = argv['output'] || argv['o'] || 'text'; debug = toBool(debug); +if(output && supportedOutput.indexOf(output) < 0) { + console.error(ERR_INVALID_OUTPUT); + return; +} if(!fpath) { console.log(help); console.log(argv); @@ -26,6 +39,7 @@ const config = {}; config.paintFormXObject = false; config.paintImageMaskXObject = false; config.paintJpegXObject = false; +config.output = output; const gd = new GoldDigger(config); gd.dig(fpath, debug) diff --git a/lib/GoldDigger.js b/lib/GoldDigger.js index 2a942c1..44a5fe0 100644 --- a/lib/GoldDigger.js +++ b/lib/GoldDigger.js @@ -2,6 +2,7 @@ const fs = require('fs'); const pdf = require('pdfjs-dist'); const Extract = require('./pdf/Extract'); const Executor = require('./pdf/Executor'); +const Formatter = require('./pdf/Formatter'); class GoldDiggerError extends Error{ @@ -11,7 +12,9 @@ class GoldDiggerError extends Error{ class GoldDigger { constructor(config) { + this.config = config; this.executor = new Executor(config); + this.formatter = new Formatter() } async dig(fpath, debug) { @@ -26,6 +29,10 @@ class GoldDigger { data:data, }).promise; if(debug) console.log(`Pages : ${doc.numPages}`); + // prepare formatting + const format = this.config.output; + const metadata = await doc.getMetadata(); + this.formatter.start(format, doc, metadata.info); // read pages for(let pageNum = 1; pageNum <= doc.numPages; pageNum++) { const page = await doc.getPage(pageNum); @@ -38,8 +45,11 @@ class GoldDigger { const opTree = this.convertOpList(operatorList); if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`); const output = this.executeOpTree(opTree, page, dependencies); + const last = pageNum == doc.numPages; + this.formatter.format(format, page, output, last); if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`) } + this.formatter.end(format); } async loadDependencies(page, operatorList) { diff --git a/lib/pdf/Constraints.js b/lib/pdf/Constraints.js new file mode 100644 index 0000000..0ca79c8 --- /dev/null +++ b/lib/pdf/Constraints.js @@ -0,0 +1,5 @@ +const IDENTITY_MATRIX = [1, 0, 0, 1, 0, 0]; + +module.exports = { + IDENTITY_MATRIX, +} \ No newline at end of file diff --git a/lib/pdf/Executor.js b/lib/pdf/Executor.js index c64d842..d31ca53 100644 --- a/lib/pdf/Executor.js +++ b/lib/pdf/Executor.js @@ -61,7 +61,7 @@ class Executor { const newLine = el.isNewLine(y); // new line if(newLine) { - el.printText(); + if(this.config.output === 'text') el.printText(); el = this.currentObject.newLine(); } // create new text element always after new line diff --git a/lib/pdf/Extract.js b/lib/pdf/Extract.js index c79191c..b453927 100644 --- a/lib/pdf/Extract.js +++ b/lib/pdf/Extract.js @@ -1,4 +1,5 @@ const FontObject = require('./FontObject'); +const Constraints = require('./Constraints'); class ExtractText { getText(glyphs, line) { @@ -23,7 +24,8 @@ class ExtractText { } partial += glyph.unicode; const width = glyph.width; - const widthAdvanceScale = font.size * line.fontMatrix[0]; + // const widthAdvanceScale = font.size * line.fontMatrix[0]; + const widthAdvanceScale = font.size * Constraints.IDENTITY_MATRIX[0]; const charWidth = width * widthAdvanceScale + spacing * font.direction; x += charWidth; } diff --git a/lib/pdf/Formatter.js b/lib/pdf/Formatter.js new file mode 100644 index 0000000..f98aa53 --- /dev/null +++ b/lib/pdf/Formatter.js @@ -0,0 +1,100 @@ +class Formatter { + + constructor() { + this.formatters = { + json: new FormatterJSON(), + xml: new FormatterXML(), + } + } + + start(format, doc, metadata) { + const o = this.formatters[format].start(doc, metadata); + console.log(o); + } + + format(format, page, data, last) { + const o = this.formatters[format].format(page, data, last); + console.log(o); + } + + end(format) { + const o = this.formatters[format].end(); + console.log(o); + } +} + +class FormatterJSON { + start(doc, metadata) { + const meta = JSON.stringify(metadata) + return `{ + "pages_count": ${doc.numPages}, + "metadata": ${meta}, + "pages": { + ` + } + + format(page, data, last) { + const txtData = []; + data.forEach(textObject => { + const txtObjOut = {lines: [], x: textObject.x, y: textObject.y}; + textObject.getData().forEach(textLine => { + const txtLineOut = { + text: [], + x: textLine.x, + y: textLine.y, + w: textLine.w, + h: textLine.h, + textMatrix: textLine.textMatrix, + } + textLine.getData().forEach(textFont => { + const font = textFont.getFont(); + txtLineOut.text.push({ + font: { + size: font.size, + direction: font.direction, + family: font.family, + size: font.size, + style: font.style, + weight: font.weight, + }, + text: textFont.getText(), + charSpacing: textFont.charSpacing, + wordSpacing: textFont.wordSpacing, + }) + }); + txtObjOut.lines.push(txtLineOut); + }); + txtData.push(txtObjOut); + }); + let output = { + "data": txtData, + } + const out = JSON.stringify(output)// pretty print (output, null, 4) + return `"${page.pageIndex}": ${out}${last ? '': ','}` + } + + end() { + return `} + } + ` + } +} + +class FormatterXML { + start(doc, metadata) { + return ` + + ` + } + + format(page, data) { + const output = ''; + return output + } + + end() { + return '' + } +} + +module.exports = Formatter; \ No newline at end of file diff --git a/lib/pdf/Text.js b/lib/pdf/Text.js index 1c0fa24..935e7ae 100644 --- a/lib/pdf/Text.js +++ b/lib/pdf/Text.js @@ -1,7 +1,5 @@ const PdfObject = require('./PdfObject'); -const IDENTITY_MATRIX = [1, 0, 0, 1, 0, 0]; - class TextObject extends PdfObject { constructor() { super(); @@ -17,6 +15,10 @@ class TextObject extends PdfObject { getLine() { return this._textLines[this._textLines.length -1] } + + getData() { + return this._textLines; + } } @@ -73,6 +75,10 @@ class TextLine extends PdfObject { this._textFonts.forEach((el) => txt += el.getText()); console.log(txt); } + + getData() { + return this._textFonts; + } } class TextFont extends PdfObject { @@ -82,7 +88,6 @@ class TextFont extends PdfObject { this._text = ""; this.charSpacing = 0; this.wordSpacing = 0; - this.fontMatrix = IDENTITY_MATRIX; } getFont() { @@ -104,8 +109,7 @@ class TextFont extends PdfObject { equals(font) { return this.font === font.font && this.charSpacing === font.charSpacing - && this.wordSpacing === font.wordSpacing - && this.fontMatrix === font.fontMatrix; + && this.wordSpacing === font.wordSpacing; } }