Add image data information to output xml, json formatters

This commit is contained in:
Michal Szczepanski 2019-07-28 17:27:02 +02:00
parent e8685c78af
commit 0519406a25
7 changed files with 119 additions and 13 deletions

@ -1,3 +1,5 @@
const Model = require('../model');
/**
* Format PDF into json data
*/
@ -43,6 +45,20 @@ class FormatterJSON {
return txtObjOut;
}
/**
* Format image object
* @param {ImageObject} imageObject
*/
formatImageObject (imageObject) {
return {
x: imageObject.x,
y: imageObject.y,
width: imageObject.width,
height: imageObject.height,
name: imageObject.name,
};
}
/**
* Formats {@link TextLine} to JSON serializable object
* @param {TextLine} textLine
@ -92,14 +108,20 @@ class FormatterJSON {
*/
format (page, data, last) {
const txtData = [];
data.forEach(textObject => {
const txtObjOut = this.formatTextObject(textObject);
txtData.push(txtObjOut);
data.forEach(pdfObject => {
if (pdfObject instanceof Model.TextObject) {
const txtObjOut = this.formatTextObject(pdfObject);
txtData.push(txtObjOut);
} else if (pdfObject instanceof Model.ImageObject) {
const imgObjOut = this.formatImageObject(pdfObject);
txtData.push(imgObjOut);
} else {
console.warn(`Not recognised object ${pdfObject}`);
}
});
const output = {
const out = JSON.stringify({
data: txtData,
};
const out = JSON.stringify(output); // pretty print (output, null, 4)
}); // pretty print (output, null, 4)
return `"${page.pageIndex}": ${out}${last ? '' : ','}`;
}

@ -1,3 +1,4 @@
const Model = require('../model');
/**
* Format PDF into text data
*/
@ -64,9 +65,14 @@ class FormatterText {
*/
format (page, data, last) {
let output = '';
data.forEach(textObject => {
const txtObjOut = this.formatTextObject(textObject);
output += txtObjOut;
data.forEach(pdfObject => {
if (pdfObject instanceof Model.TextObject) {
output += this.formatTextObject(pdfObject);
} else if (pdfObject instanceof Model.ImageObject) {
// TODO add config to save image position
} else {
console.warn(`Not recognised object ${pdfObject}`);
}
});
return output;
}

@ -1,3 +1,5 @@
const Model = require('../model');
/**
* Format PDF into xml data
*/
@ -30,10 +32,22 @@ class FormatterXML {
lines.forEach(textLine => {
txtObjOut += this.formatTextLine(textLine);
});
txtObjOut += '</object>';
txtObjOut += '</object>\n';
return txtObjOut;
}
/**
* Format image object
* @param {ImageObject} imageObject
*/
formatImageObject (imageObject) {
return `<image
x="${imageObject.x}"
y="${imageObject.y}"
width="${imageObject.width}"
height="${imageObject.height}">${imageObject.name}</image>\n`;
}
/**
* Formats {@link TextLine} to xml object
* @param {TextLine} textLine
@ -76,8 +90,14 @@ class FormatterXML {
*/
format (page, data, last) {
let out = '<data>\n';
data.forEach(textObject => {
out += this.formatTextObject(textObject) + '\n';
data.forEach(pdfObject => {
if (pdfObject instanceof Model.TextObject) {
out += this.formatTextObject(pdfObject);
} else if (pdfObject instanceof Model.ImageObject) {
out += this.formatImageObject(pdfObject);
} else {
console.warn(`Not recognised object ${pdfObject}`);
}
});
out += '</data>';
return out;

@ -0,0 +1,43 @@
const PdfObject = require('./PdfObject');
/**
 * Model object describing an image: the name of the saved image together
 * with the values passed to {@link ImageObject#fill}.
 */
class ImageObject extends PdfObject {
    /**
     * Create an image description with an empty name and zero size.
     * NOTE(review): x and y are not initialised here - presumably they are
     * declared by PdfObject; confirm.
     */
    constructor () {
        super();
        /** @type {string} saved image name */
        this.name = '';
        /** @type {number} document image width */
        this.width = 0;
        /** @type {number} document image height */
        this.height = 0;
    }

    /**
     * Store all image properties on this object in one call.
     * @param {string} name saved image name
     * @param {number} x value stored as `this.x`
     * @param {number} y value stored as `this.y`
     * @param {number} width document image width
     * @param {number} height document image height
     */
    fill (name, x, y, width, height) {
        // Assigns in the same order as the parameters are listed.
        Object.assign(this, { name, x, y, width, height });
    }
}
module.exports = ImageObject;

@ -33,6 +33,14 @@ class PdfPage extends PdfObject {
this.fonts = {};
}
/**
* Add image to object list
* @param {ImageObject} image
*/
addImage (image) {
this.objectList.push(image);
}
/**
* Set current object and add it to objectList
* @param {TextObject} obj

@ -1,5 +1,6 @@
const FontObject = require('./FontObject');
const PdfObject = require('./PdfObject');
const ImageObject = require('./ImageObject');
const PdfPage = require('./PdfPage');
const TextObject = require('./TextObject');
const TextFont = require('./text/TextFont');
@ -12,4 +13,5 @@ module.exports = {
TextFont,
TextLine,
PdfPage,
ImageObject,
};

@ -1,4 +1,5 @@
const VisitorBase = require('./VisitorBase');
const Model = require('../model');
const pdfjs = require('../../pdfjs');
const FileManager = require('../FileManager');
@ -59,7 +60,11 @@ class VisitorImage extends VisitorBase {
// TODO implement mask
const mask = false;
const imgBinary = pdfjs.convertImgDataToPng(imgData, this.forceDataSchema, !!mask);
const fpath = `${this.config.outputDir}/img/page.${this.page.data.pageIndex}.${args[1]}.png`;
const fname = `page.${this.page.data.pageIndex}.${args[1]}.png`;
const fpath = `${this.config.outputDir}/img/${fname}`;
const image = new Model.ImageObject();
image.fill(fname, this.page.x, this.page.y, imgData.width, imgData.height);
this.page.addImage(image);
await FileManager.saveFileAsync(fpath, imgBinary);
}
}