Documentation update / move formatter, visitor to formatters, visitors

This commit is contained in:
Michal Szczepanski 2019-07-26 00:38:51 +02:00
parent d1400175a9
commit 892a8c7bb1
21 changed files with 191 additions and 89 deletions

@ -19,8 +19,15 @@ class GoldDiggerError extends Error{
*/
class GoldDigger {
/**
* Create a digger for the given configuration.
* The configuration object is stored on the instance and is also
* passed to the Formatter created here.
* @param {object} config - configuration
*/
constructor(config) {
this.config = config;
/**
* Formatter built from the same configuration object.
* @type {Formatter}
*/
this.formatter = new Formatter(config);
}
@ -72,7 +79,7 @@ class GoldDigger {
/**
* Process page
* @param pageData - pdf page
* @param pageNum - page number
* @param {number} pageNum - page number
*/
async digPage(pageData, pageNum) {
@ -172,9 +179,7 @@ class GoldDigger {
* Process pdf file format
* Based on (SVGGraphics)
* @param opTree - pdf tree of information
* @param page - pdf page
* @param dependencies - loadDependencies data
* @returns {Array} PDFObject array
* @param {Visitor} visitor - class for parsing incoming tags
*/
executeOpTree(opTree, visitor) {
const debug = visitor.debug;

@ -8,8 +8,8 @@ const util = require('pdfjs-dist/lib/shared/util');
class ExtractText {
/**
* Return text from glyphs array
* @param glyphs - glyphs from pdf.OPS.showText, pdf.OPS.showSpacedText
* @param page - PdfPage object @see PdfPage
* @param {array} glyphs - glyphs from pdf.OPS.showText, pdf.OPS.showSpacedText
* @param {PdfPage} page - pdf page object
*/
showText(glyphs, page) {
// MOVED from VisitorText
@ -77,7 +77,7 @@ class ExtractText {
/**
* Find font family inside loadedDependencies based on font name
* @param name - font name
* @param {string} name - font name
* @param dependencies - pdf document data
* @returns {*} font from dependencies if found otherwise null
* (probably need to warn user for missing font inside document)
@ -92,9 +92,9 @@ class ExtractText {
}
/**
* Gets FontObject from page information when pdf.OPS.setFont
* Gets {@link FontObject} from page information when pdf.OPS.setFont
* @param details - arguments from pdf.OPS.setFont
* @param page - pdf page @see PdfPage
* @param {PdfPage} page - pdf page
*/
setFont(details, page) {
const fontObj = page.data.commonObjs.get(details[0]);

@ -21,7 +21,11 @@ const mkdirNotExists = (path) => {
fs.mkdirSync(path);
}
}
/**
 * List a directory synchronously.
 * Thin wrapper that delegates to fs.readdirSync.
 * @param {string} path - directory to list
 * @returns {string[]} - whatever fs.readdirSync returns for that path
 */
const readdirSync = (path) => fs.readdirSync(path);

@ -1,18 +1,23 @@
const FormatterJSON = require('./formatter/FormatterJSON');
const FormatterXML = require('./formatter/FormatterXML');
const FormatterText = require('./formatter/FormatterText');
const f = require('./formatters');
/**
* Formats PDF to desired output
*/
class Formatter {
/**
* Constructor
* @param config
*/
constructor(config) {
this.debug = config.debug;
/**
* @type {{json: FormatterJSON, xml: FormatterXML, text: FormatterText}}
*/
this.formatters = {
json: new FormatterJSON(),
xml: new FormatterXML(),
text: new FormatterText(),
json: new f.FormatterJSON(),
xml: new f.FormatterXML(),
text: new f.FormatterText(),
}
this.data = "";
}

@ -1,4 +1,4 @@
const v = require('./visitor/index');
const v = require('./visitors');
const Model = require('./model');
const FN_TEXT = ['beginText', 'setFont', 'showText',
@ -6,7 +6,7 @@ const FN_TEXT = ['beginText', 'setFont', 'showText',
'setLeading', 'setLeadingMoveText', 'setCharSpacing',
'setWordSpacing', 'setHScale', 'setTextMatrix',
'setTextRise', 'setTextRenderingMode', 'nextLine'];
const FN_XOBJECT = ['setTextMatrix', 'paintFormXObjectBegin', 'paintFormXObjectEnd'];
const FN_XOBJECT = ['paintFormXObjectBegin', 'paintFormXObjectEnd'];
const FN_IMAGE = ['paintJpegXObject', 'paintImageXObject', 'paintInlineImageXObject', 'paintImageMaskXObject'];
/**
@ -14,12 +14,30 @@ const FN_IMAGE = ['paintJpegXObject', 'paintImageXObject', 'paintInlineImageXObj
*/
class Visitor {
/**
* Build the page model and the text, XObject and image sub-visitors.
* All three sub-visitors receive the same config and the same page instance.
* Note: config is stored by reference and its `skip` flag is set to false here,
* so the object passed in by the caller is modified.
* @param {object} config - application configuration
* @param data - pdf data, forwarded to PdfPage
* @param dependencies - pdf loaded dependencies, forwarded to PdfPage
*/
constructor (config, data, dependencies) {
this.config = config;
// reset the shared skip flag on the configuration object
this.config.skip = false;
/**
* Page model created from data and dependencies.
* @type {PdfPage}
*/
this.page = new Model.PdfPage(data, dependencies);
/**
* Sub-visitor for text, sharing this.page.
* @type {VisitorText}
*/
this.txt = new v.VisitorText(config, this.page);
/**
* Sub-visitor for XObjects, sharing this.page.
* @type {VisitorXObject}
*/
this.xobject = new v.VisitorXObject(config, this.page);
/**
* Sub-visitor for images, sharing this.page.
* @type {VisitorImage}
*/
this.image = new v.VisitorImage(config, this.page);
}

@ -1,19 +0,0 @@
/**
 * XML output formatter.
 * Only the document envelope is produced; page formatting yields an empty string.
 */
class FormatterXML {
    /**
     * Opening part of the output: XML declaration, root start tag, trailing newline.
     * @param doc - not used
     * @param metadata - not used
     * @returns {string}
     */
    start(doc, metadata) {
        const declaration = '<?xml version="1.0" encoding="UTF-8"?>';
        const rootOpen = '<document>';
        // the empty last element yields the newline after the root tag
        return [declaration, rootOpen, ''].join('\n');
    }

    /**
     * Page output; always the empty string.
     * @param page - not used
     * @param data - not used
     * @returns {string}
     */
    format(page, data) {
        return '';
    }

    /**
     * Closing part of the output: root end tag.
     * @returns {string}
     */
    end() {
        return '</document>';
    }
}
module.exports = FormatterXML;

@ -2,6 +2,12 @@
* Format PDF into json data
*/
class FormatterJSON {
/**
* See {@link Formatter}
* @param doc
* @param metadata
* @returns {string}
*/
start(doc, metadata) {
const meta = JSON.stringify(metadata)
return `{
@ -12,8 +18,8 @@ class FormatterJSON {
}
/**
* Formats {TextObject} to JSON serializable object
* @param textObject {TextObject}
* Formats {@link TextObject} to JSON serializable object
* @param {TextObject} textObject
* @returns {object}
*/
formatTextObject(textObject) {
@ -26,8 +32,8 @@ class FormatterJSON {
}
/**
* Formats {TextLine} to JSON serializable object
* @param textLine {TextLine}
* Formats {@link TextLine} to JSON serializable object
* @param {TextLine} textLine
* @returns {object}
*/
formatTextLine(textLine) {
@ -47,8 +53,8 @@ class FormatterJSON {
}
/**
* Formats {TextFont} to JSON serializable object
* @param textFont {TextFont}
* Formats {@link TextFont} to JSON serializable object
* @param {TextFont} textFont
* @returns {object}
*/
formatTextFont(textFont) {
@ -67,7 +73,7 @@ class FormatterJSON {
}
/**
* See {Formatter}
* See {@link Formatter}
* @param page
* @param data
* @param last
@ -87,7 +93,7 @@ class FormatterJSON {
}
/**
* See {Formatter}
* See {@link Formatter}
* @returns {string}
*/
end() {

@ -1,7 +1,9 @@
/**
* Format PDF into text data
*/
class FormatterText {
/**
* See {Formatter}
* See {@link Formatter}
* @param doc
* @param metadata
* @returns {string}
@ -11,8 +13,8 @@ class FormatterText {
}
/**
* Format {TextObject} to string
* @param textObject {TextObject}
* Format {@link TextObject} to string
* @param {TextObject} textObject
* @returns {string}
*/
formatTextObject(textObject) {
@ -25,8 +27,8 @@ class FormatterText {
}
/**
* Format {TextLine} to string
* @param textLine
* Format {@link TextLine} to string
* @param {TextLine} textLine
* @returns {string}
*/
formatTextLine(textLine) {
@ -38,8 +40,8 @@ class FormatterText {
}
/**
* Format {TextFont} to string
* @param textFont {TextFont}
* Format {@link TextFont} to string
* @param {TextFont} textFont
* @returns {string|TextFont|*}
*/
formatTextFont(textFont) {
@ -47,7 +49,7 @@ class FormatterText {
}
/**
* See {Formatter}
* See {@link Formatter}
* @param page
* @param data
* @param last
@ -63,7 +65,7 @@ class FormatterText {
}
/**
* See {Formatter}
* See {@link Formatter}
* @returns {string}
*/
end() {

@ -0,0 +1,38 @@
/**
 * XML output formatter.
 * Only the document envelope is produced; page formatting yields an empty string.
 */
class FormatterXML {
    /**
     * See {@link Formatter}
     * Opening part of the output: XML declaration, root start tag, trailing newline.
     * @param doc - not used
     * @param metadata - not used
     * @returns {string}
     */
    start(doc, metadata) {
        const declaration = '<?xml version="1.0" encoding="UTF-8"?>';
        const rootOpen = '<document>';
        // the empty last element yields the newline after the root tag
        return [declaration, rootOpen, ''].join('\n');
    }

    /**
     * See {@link Formatter}
     * Page output; always the empty string.
     * @param page - not used
     * @param data - not used
     * @param last - not used
     * @returns {string}
     */
    format(page, data, last) {
        return '';
    }

    /**
     * See {@link Formatter}
     * Closing part of the output: root end tag.
     * @returns {string}
     */
    end() {
        return '</document>';
    }
}
module.exports = FormatterXML;

@ -2,21 +2,51 @@
* Store font information
*/
class FontObject {
/**
* Constructor
*/
constructor() {
/**
* @type {number}
*/
this.size = 0;
/**
* @type {number}
*/
this.sizeScale = 1;
/**
* @type {string}
*/
this.weight = 'normal';
/**
* @type {string}
*/
this.style = null;
/**
* @type {string}
*/
this.family = null;
/**
* @type {number}
*/
this.direction = 1;
/**
* @type {boolean}
*/
this.vertical = false;
/**
* @type {boolean}
*/
this.spaceWidthIsSet = false;
/**
* @type {number}
*/
this.spaceWidth = 250;
}
/**
* Set font size from a number; a negative size indicates the opposite font direction
* @param size
* @param {number} size
*/
setSize(size) {
if (size < 0) {

@ -2,8 +2,19 @@
* Main class for handling pdf data
*/
class PdfObject {
/**
* Create an object positioned at x = 0, y = 0.
*/
constructor () {
/**
* x position of the element, initialised to 0
* @type {number}
*/
this.x = 0;
/**
* y position of the element, initialised to 0
* @type {number}
*/
this.y = 0;
}
}

@ -1,6 +1,15 @@
const PdfObject = require('./PdfObject');
/**
* Holds PDF page information
* @extends {PdfObject}
*/
class PdfPage extends PdfObject {
/**
* Constructor
* @param {object} data - pdf page data opcodes
* @param {object} dependencies - pdf loaded resources
*/
constructor (data, dependencies) {
super();
this.data = data;
@ -21,6 +30,10 @@ class PdfPage extends PdfObject {
this.currentFont;
}
/**
* Set current object and add it to objectList
* @param {TextObject} obj
*/
setCurrentObject(obj) {
this.currentObject = obj
this.objectList.push(obj);

@ -18,7 +18,7 @@ class TextObject extends PdfObject {
}
/**
* Create new {TextLine} adds it to array and returns it as value
* Create new {@link TextLine} adds it to array and returns it as value
* @returns {TextLine}
*/
newLine() {
@ -28,7 +28,7 @@ class TextObject extends PdfObject {
}
/**
* Return last {TextLine} from text array
* Return last {@link TextLine} from text array
* @returns {TextLine}
*/
getLine() {
@ -36,7 +36,7 @@ class TextObject extends PdfObject {
}
/**
* Return array of {TextLine}
* Return array of {@link TextLine}
* @returns {Array}
*/
getData() {

@ -2,6 +2,7 @@ const PdfObject = require('./../PdfObject');
/**
* Represents Font information in pdf file
* @extends {PdfObject}
*/
class TextFont extends PdfObject {
constructor() {
@ -25,6 +26,11 @@ class TextFont extends PdfObject {
}
}
/**
* Heuristic method to check if glyph number is space
* @param {number} glyph - numeric size
* @returns {boolean} - true if the glyph is treated as a space, false otherwise
*/
isSpace(glyph) {
if(-glyph >= this.font.spaceWidth) {
return true;
@ -44,17 +50,6 @@ class TextFont extends PdfObject {
getText() {
return this._text;
}
/**
* Check if provided {FontObject} is equal to existing one
* @param font {FontObject}
* @returns {boolean}
*/
equals(font) {
return this.font === font.font
&& this.charSpacing === font.charSpacing
&& this.wordSpacing === font.wordSpacing;
}
}
module.exports = TextFont;

@ -3,6 +3,7 @@ const TextFont = require('./../text/TextFont');
/**
* Represents text line in pdf file
* @extends {PdfObject}
*/
class TextLine extends PdfObject {
constructor() {
@ -14,14 +15,14 @@ class TextLine extends PdfObject {
/**
* Adds line with font to text
* @param line @see {TextFont}
* @param {TextFont} line
*/
addTextFont(line) {
this._textFonts.push(line);
}
/**
* Get last TextFont
* Get last {@link TextFont}
* @returns {TextFont}
*/
getLastFontText() {
@ -40,8 +41,8 @@ class TextLine extends PdfObject {
}
/**
* Get Array of all {TextFont} data inside this text line
* @returns {Array} of {TextFont}
* Get Array of all {@link TextFont} data inside this text line
* @returns {Array} of {@link TextFont}
*/
getData() {
return this._textFonts;

@ -5,7 +5,7 @@ class VisitorBase {
/**
* Constructor
* @param config - configuration
* @param page - PdfPage @see PdfPage
* @param {PdfPage} page
*/
constructor(config, page) {
this.config = config;

@ -4,6 +4,7 @@ const FileManager = require('../FileManager');
/**
* Visit image objects in pdf files
* @extends {VisitorBase}
*/
class VisitorImage extends VisitorBase {

@ -4,6 +4,7 @@ const VisitorBase = require('./VisitorBase');
/**
* Visits text data while parsing pdf
* @extends {VisitorBase}
*/
class VisitorText extends VisitorBase {
constructor(config, page) {
@ -138,7 +139,10 @@ class VisitorText extends VisitorBase {
if (this.debug) console.log('setTextRise');
if (this.config.skip) return;
}
/**
* pdf.OPS.setTextRenderingMode
*/
setTextRenderingMode(args) {
if (this.debug) console.log('setTextRenderingMode');
if (this.config.skip) return;

@ -3,22 +3,10 @@ const VisitorBase = require('./VisitorBase');
/**
* Visits XObject data when parsing pdf
* Currently only sets flag skip for XObjectBegin
* @extends {VisitorBase}
*/
class VisitorXObject extends VisitorBase {
/**
* pdf.OPS.setTextMatrix
* Currently only logs the call when config.debug is set; the matrix handling
* in the body is commented out, so args is not used.
* @param args - operator arguments (unused while the body is disabled)
*/
setTextMatrix(args) {
if (this.config.debug) console.log('setTextMatrix');
// guard for the disabled code below; nothing executes after it at present
if (this.config.skip) return;
// NOTE(review): disabled implementation kept for reference
/*const a = args[0], b = args[1], c = args[2], d = args[3], e = args[4], f = args[5];
const el = this.currentObject.getLine();
// new text font
el.newText();
el.textMatrix = [a, b, c, d, e, f];*/
}
/**
* pdf.OPS.paintFormXObjectBegin
*/