Add output formatter and json output

This commit is contained in:
Michal Szczepanski 2019-07-22 22:46:05 +02:00
parent 54af1b3753
commit b3d9b10317
8 changed files with 148 additions and 11 deletions

View File

@ -8,7 +8,7 @@ and [node.js](https://nodejs.org).
### Usage
``git clone https://github.com/vane/pdf-gold-digger``
``gd -f some.pdf``
``node gd.js -f some_file.pdf``
### Supports:
- extract text
@ -16,11 +16,13 @@ and [node.js](https://nodejs.org).
- separate each line
- separate font information
- bounding box position
- output to text ``-o text (default)``
- output to json ``-o json``
### TODO:
- specify output format and output directory
- output to xml format
- output to json format
- ~~output to json format~~
- extract images to files
- extract font
- extract tables

18
gd.js
View File

@ -2,9 +2,17 @@ const GoldDigger = require('./lib/GoldDigger');
const minimist = require('minimist');
const help = `
--file -f pdf file location
--debug -d show debug information
--file -f pdf file location (required)
--debug -d show debug information (optional - default false)
--output -o output format (optional - default text)
`
// Values accepted by --output / -o. `json` has to be listed here, otherwise
// the validation below rejects `-o json` before the JSON formatter is reached.
const supportedOutput = ['text', 'json', 'xml'];
const ERR_INVALID_OUTPUT = `
Invalid output
Please specify one of those values : "${supportedOutput}"
`
// converts argument to boolean
const toBool = (val) => {
return val === 'true' || val === 1 || val === true;
@ -13,7 +21,12 @@ const toBool = (val) => {
const argv = minimist(process.argv.slice(2))
const fpath = argv['file'] || argv['f'];
let debug = argv['debug'] || argv['d'];
let output = argv['output'] || argv['o'] || 'text';
debug = toBool(debug);
if(output && supportedOutput.indexOf(output) < 0) {
console.error(ERR_INVALID_OUTPUT);
return;
}
if(!fpath) {
console.log(help);
console.log(argv);
@ -26,6 +39,7 @@ const config = {};
config.paintFormXObject = false;
config.paintImageMaskXObject = false;
config.paintJpegXObject = false;
config.output = output;
const gd = new GoldDigger(config);
gd.dig(fpath, debug)

View File

@ -2,6 +2,7 @@ const fs = require('fs');
const pdf = require('pdfjs-dist');
const Extract = require('./pdf/Extract');
const Executor = require('./pdf/Executor');
const Formatter = require('./pdf/Formatter');
class GoldDiggerError extends Error{
@ -11,7 +12,9 @@ class GoldDiggerError extends Error{
class GoldDigger {
constructor(config) {
this.config = config;
this.executor = new Executor(config);
this.formatter = new Formatter()
}
async dig(fpath, debug) {
@ -26,6 +29,10 @@ class GoldDigger {
data:data,
}).promise;
if(debug) console.log(`Pages : ${doc.numPages}`);
// prepare formatting
const format = this.config.output;
const metadata = await doc.getMetadata();
this.formatter.start(format, doc, metadata.info);
// read pages
for(let pageNum = 1; pageNum <= doc.numPages; pageNum++) {
const page = await doc.getPage(pageNum);
@ -38,8 +45,11 @@ class GoldDigger {
const opTree = this.convertOpList(operatorList);
if(debug) console.log(`--- BEGIN Page ${pageNum} size: ${viewport.width}x${viewport.height}`);
const output = this.executeOpTree(opTree, page, dependencies);
const last = pageNum == doc.numPages;
this.formatter.format(format, page, output, last);
if(debug) console.log(`--- END Page ${pageNum} objects : ${output.length}`)
}
this.formatter.end(format);
}
async loadDependencies(page, operatorList) {

5
lib/pdf/Constraints.js Normal file
View File

@ -0,0 +1,5 @@
const IDENTITY_MATRIX = [1, 0, 0, 1, 0, 0];
module.exports = {
IDENTITY_MATRIX,
}

View File

@ -61,7 +61,7 @@ class Executor {
const newLine = el.isNewLine(y);
// new line
if(newLine) {
el.printText();
if(this.config.output === 'text') el.printText();
el = this.currentObject.newLine();
}
// create new text element always after new line

View File

@ -1,4 +1,5 @@
const FontObject = require('./FontObject');
const Constraints = require('./Constraints');
class ExtractText {
getText(glyphs, line) {
@ -23,7 +24,8 @@ class ExtractText {
}
partial += glyph.unicode;
const width = glyph.width;
const widthAdvanceScale = font.size * line.fontMatrix[0];
// const widthAdvanceScale = font.size * line.fontMatrix[0];
const widthAdvanceScale = font.size * Constraints.IDENTITY_MATRIX[0];
const charWidth = width * widthAdvanceScale + spacing * font.direction;
x += charWidth;
}

100
lib/pdf/Formatter.js Normal file
View File

@ -0,0 +1,100 @@
/**
 * Routes output to the formatter registered for the requested format and
 * prints whatever that formatter returns.
 *
 * Only `json` and `xml` have formatters. Any other format name (notably the
 * default `text`, which has no entry in `this.formatters`) is ignored here
 * instead of crashing on an undefined lookup.
 */
class Formatter {
  constructor() {
    this.formatters = {
      json: new FormatterJSON(),
      xml: new FormatterXML(),
    };
  }

  /**
   * Prints the document header for `format`.
   * @param {string} format - output format name, looked up in `this.formatters`
   * @param {object} doc - loaded document, passed through to the formatter
   * @param {object} metadata - document metadata, passed through to the formatter
   */
  start(format, doc, metadata) {
    const formatter = this.formatters[format];
    // No formatter registered for this format: nothing to print.
    if (!formatter) return;
    console.log(formatter.start(doc, metadata));
  }

  /**
   * Prints the formatted content of a single page for `format`.
   * @param {string} format - output format name, looked up in `this.formatters`
   * @param {object} page - page being formatted, passed through to the formatter
   * @param {Array} data - extracted objects of the page
   * @param {boolean} last - true when this is the final page of the document
   */
  format(format, page, data, last) {
    const formatter = this.formatters[format];
    if (!formatter) return;
    console.log(formatter.format(page, data, last));
  }

  /**
   * Prints the document footer for `format`.
   * @param {string} format - output format name, looked up in `this.formatters`
   */
  end(format) {
    const formatter = this.formatters[format];
    if (!formatter) return;
    console.log(formatter.end());
  }
}
/**
 * Builds a JSON document in pieces: `start()` opens the document, `format()`
 * returns one `"pageIndex": {...}` entry per page, and `end()` closes it.
 * Concatenating the pieces in call order yields one valid JSON document.
 */
class FormatterJSON {
  /**
   * Opens the JSON document and the `pages` object.
   * @param {object} doc - document; only `numPages` is read
   * @param {*} metadata - document metadata, serialized as-is
   * @returns {string} opening fragment, ending with a newline
   */
  start(doc, metadata) {
    const serialized = JSON.stringify(metadata);
    // JSON.stringify returns undefined for undefined input, which would be
    // interpolated as a bare `undefined` token and break the document.
    const meta = serialized === undefined ? 'null' : serialized;
    return [
      '{',
      `"pages_count": ${doc.numPages},`,
      `"metadata": ${meta},`,
      '"pages": {',
      '',
    ].join('\n');
  }

  /**
   * Serializes one page as a member of the `pages` object.
   * @param {object} page - page; only `pageIndex` is read (used as the key)
   * @param {Array} data - text objects exposing `x`, `y` and `getData()`
   * @param {boolean} last - true for the final page (no trailing comma)
   * @returns {string} `"<pageIndex>": {"data": [...]}` plus `,` unless last
   */
  format(page, data, last) {
    // Key order inside each literal determines the order in the JSON output.
    const txtData = data.map((textObject) => ({
      lines: textObject.getData().map((textLine) => ({
        text: textLine.getData().map((textFont) => {
          const font = textFont.getFont();
          return {
            font: {
              size: font.size,
              direction: font.direction,
              family: font.family,
              style: font.style,
              weight: font.weight,
            },
            text: textFont.getText(),
            charSpacing: textFont.charSpacing,
            wordSpacing: textFont.wordSpacing,
          };
        }),
        x: textLine.x,
        y: textLine.y,
        w: textLine.w,
        h: textLine.h,
        textMatrix: textLine.textMatrix,
      })),
      x: textObject.x,
      y: textObject.y,
    }));
    const out = JSON.stringify({ data: txtData });
    return `"${page.pageIndex}": ${out}${last ? '' : ','}`;
  }

  /**
   * Closes the `pages` object and the document.
   * @returns {string} closing fragment
   */
  end() {
    return '}\n}\n';
  }
}
/**
 * XML output formatter. Currently only the document envelope is produced:
 * pages contribute an empty string.
 */
class FormatterXML {
  /**
   * Returns the XML declaration followed by the opening root tag.
   * Both arguments are accepted but not used.
   */
  start(doc, metadata) {
    const declaration = '<?xml version="1.0" encoding="UTF-8"?>';
    const rootOpen = '<document>';
    return `${declaration}\n${rootOpen}\n`;
  }

  /** Returns the page content, which is always empty for now. */
  format(page, data) {
    return '';
  }

  /** Returns the closing root tag (no trailing newline). */
  end() {
    const rootClose = '</document>';
    return rootClose;
  }
}
module.exports = Formatter;

View File

@ -1,7 +1,5 @@
const PdfObject = require('./PdfObject');
const IDENTITY_MATRIX = [1, 0, 0, 1, 0, 0];
class TextObject extends PdfObject {
constructor() {
super();
@ -17,6 +15,10 @@ class TextObject extends PdfObject {
getLine() {
return this._textLines[this._textLines.length -1]
}
getData() {
return this._textLines;
}
}
@ -73,6 +75,10 @@ class TextLine extends PdfObject {
this._textFonts.forEach((el) => txt += el.getText());
console.log(txt);
}
getData() {
return this._textFonts;
}
}
class TextFont extends PdfObject {
@ -82,7 +88,6 @@ class TextFont extends PdfObject {
this._text = "";
this.charSpacing = 0;
this.wordSpacing = 0;
this.fontMatrix = IDENTITY_MATRIX;
}
getFont() {
@ -104,8 +109,7 @@ class TextFont extends PdfObject {
equals(font) {
return this.font === font.font
&& this.charSpacing === font.charSpacing
&& this.wordSpacing === font.wordSpacing
&& this.fontMatrix === font.fontMatrix;
&& this.wordSpacing === font.wordSpacing;
}
}