Add FormatterHTML for html output, closes #15

This commit is contained in:
Michal Szczepanski 2019-07-28 19:25:37 +02:00
parent 9b0574777f
commit e4f44a5642
5 changed files with 123 additions and 6 deletions

@ -2,7 +2,7 @@ pdf-gold-digger
====
Pdf information extraction library based on [pdf.js](https://mozilla.github.io/pdf.js/)
and [node.js](https://nodejs.org).
and [node.js](https://nodejs.org) with various output formats.
![GitHub](https://img.shields.io/github/license/vane/pdf-gold-digger)
![npm](https://img.shields.io/npm/v/pdf-gold-digger)
@ -27,8 +27,8 @@ ex. pdfdig -i input-file -o output_directory -f json
--input or -i pdf file location (required)
--output or -o output directory (optional - default "out")
--debug or -d show debug information (optional - default "false")
--format or -f format (optional - default "text") - ("text,json,xml")
--font or -t extract fonts as ttf files
--format or -f format (optional - default "text") - ("text,json,xml,html")
--font or -t extract fonts as ttf files (optional)
--help or -h display this help message
```
@ -52,12 +52,12 @@ and see results in ```out``` directory
- text ```-f text (default)```
- json ```-f json```
- xml ```-f xml```
- html ```-f html```
- specify output directory
## TODO:
- load pdf from remote location
- from url
- output to html format
- output to markdown format
- output to zip
- extract tables

2
gd.js

@ -3,7 +3,7 @@ const GoldDigger = require('./src/GoldDigger');
const FileManager = require('./src/pdf/FileManager');
const supportedFormat = ['text', 'json', 'xml'];
const supportedFormat = ['text', 'json', 'xml', 'html'];
const ERR_INVALID_FORMAT = `
Invalid output
Please specify one of those values : "${supportedFormat}"

@ -11,12 +11,13 @@ class Formatter {
/**
 * Stores the debug flag from `config`, creates one formatter instance per
 * output format key (json, xml, text, html) and starts with empty `data`.
 * @param {object} config - options object; only `config.debug` is read here
 */
constructor (config) {
this.debug = config.debug;
/**
* @type {{json: FormatterJSON, xml: FormatterXML, text: FormatterText}}
* @type {{json: FormatterJSON, xml: FormatterXML, text: FormatterText, html: FormatterHTML}}
*/
this.formatters = {
json: new f.FormatterJSON(),
xml: new f.FormatterXML(),
text: new f.FormatterText(),
html: new f.FormatterHTML(),
};
// NOTE(review): presumably the accumulated formatted output — confirm against the rest of the class.
this.data = '';
}

@ -0,0 +1,114 @@
const Model = require('../model');
/**
 * Characters that are significant in HTML markup, mapped to the entities
 * that replace them when document-supplied values are written into the page.
 */
const HTML_ESCAPES = Object.freeze({
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;',
});

/**
 * Formats extracted PDF content as an HTML document.
 *
 * Every value taken from the PDF (text, font properties, image names and
 * dimensions) is HTML-escaped before it is interpolated, so characters such
 * as `<`, `&` or `"` inside a document cannot break or inject markup.
 */
class FormatterHTML {
  /**
   * Escapes the HTML-significant characters of a value.
   * @param {*} value - converted with `String(value)` before escaping
   * @returns {string} text safe to place in element content or inside a
   * quoted attribute value
   */
  escapeHtml (value) {
    return String(value).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);
  }

  /**
   * See {@link Formatter}
   * Produces the document prologue up to and including the opening body tag.
   * @param doc - not used by this formatter
   * @param metadata - not used by this formatter
   * @returns {string}
   */
  start (doc, metadata) {
    return [
      '<!DOCTYPE html>',
      '<html lang="en">',
      '<head>',
      '<meta charset="UTF-8">',
      '<title>Title</title>',
      '</head>',
      '<body>',
      '',
    ].join('\n');
  }

  /**
   * Formats {@link TextObject} to an HTML `<div>` containing its text lines,
   * ordered by descending `y`.
   * @param {TextObject} textObject
   * @returns {string}
   */
  formatTextObject (textObject) {
    // Sort a copy: getData() may return the object's own array, and sorting
    // it in place would reorder the caller's data as a side effect.
    // NOTE(review): descending y presumably means top-of-page first — confirm.
    const lines = [...textObject.getData()].sort((a, b) => {
      if (a.y > b.y) return -1;
      if (a.y < b.y) return 1;
      return 0;
    });
    const body = lines.map(textLine => this.formatTextLine(textLine)).join('');
    return `<div>\n${body}</div>\n`;
  }

  /**
   * Formats {@link ImageObject} to an `<img>` tag pointing at `img/<name>`.
   * @param {ImageObject} imageObject
   * @returns {string}
   */
  formatImageObject (imageObject) {
    return [
      '<img class="pdf-dig-img"',
      `width="${this.escapeHtml(imageObject.width)}px"`,
      `height="${this.escapeHtml(imageObject.height)}px"`,
      `src="img/${this.escapeHtml(imageObject.name)}"/>`,
      '',
    ].join('\n');
  }

  /**
   * Formats {@link TextLine} to an HTML `<p>` wrapping one `<span>` per font run.
   * @param {TextLine} textLine
   * @returns {string}
   */
  formatTextLine (textLine) {
    const spans = textLine.getText()
      .map(textFont => this.formatTextFont(textFont))
      .join('');
    return `<p class="pdfdig-text-line">${spans}</p>\n`;
  }

  /**
   * Formats {@link TextFont} to an HTML `<span>` styled with the font's
   * family, size (pt) and weight.
   * @param {TextFont} textFont
   * @returns {string}
   */
  formatTextFont (textFont) {
    const { font } = textFont;
    return [
      `<span class="pdfdig-text-font" style="font-family: ${this.escapeHtml(font.family)};`,
      `font-size:${this.escapeHtml(font.size)}pt;`,
      `font-weight:${this.escapeHtml(font.weight)};">`,
      this.escapeHtml(textFont.getText()),
      '</span>',
    ].join('\n');
  }

  /**
   * See {@link Formatter}
   * Wraps the formatted text and image objects of one page in a `<div>`;
   * objects of any other type are reported with `console.warn` and skipped.
   * @param page - not used by this formatter
   * @param data - objects to format, in output order
   * @param last - not used by this formatter
   * @returns {string}
   */
  format (page, data, last) {
    let out = '<div class="pdfdig-pdf-object">\n';
    data.forEach(pdfObject => {
      if (pdfObject instanceof Model.TextObject) {
        out += this.formatTextObject(pdfObject);
      } else if (pdfObject instanceof Model.ImageObject) {
        out += this.formatImageObject(pdfObject);
      } else {
        console.warn(`Not recognised object ${pdfObject}`);
      }
    });
    out += '</div>';
    return out;
  }

  /**
   * See {@link Formatter}
   * Closes the tags opened by {@link FormatterHTML#start}.
   * @returns {string}
   */
  end () {
    return '</body>\n</html>';
  }
}
module.exports = FormatterHTML;

@ -1,9 +1,11 @@
const FormatterJSON = require('./FormatterJSON');
const FormatterText = require('./FormatterText');
const FormatterXML = require('./FormatterXML');
const FormatterHTML = require('./FormatterHTML');
// Formatter classes exposed by this module, one per output format (xml, json, text, html).
module.exports = {
FormatterXML,
FormatterJSON,
FormatterText,
FormatterHTML,
};