Add FormatterHTML for html output, closes #15
This commit is contained in:
parent
9b0574777f
commit
e4f44a5642
@ -2,7 +2,7 @@ pdf-gold-digger
|
||||
====
|
||||
|
||||
Pdf information extraction library based on [pdf.js](https://mozilla.github.io/pdf.js/)
|
||||
and [node.js](https://nodejs.org).
|
||||
and [node.js](https://nodejs.org) with various output formats.
|
||||
|
||||
![GitHub](https://img.shields.io/github/license/vane/pdf-gold-digger)
|
||||
![npm](https://img.shields.io/npm/v/pdf-gold-digger)
|
||||
@ -27,8 +27,8 @@ ex. pdfdig -i input-file -o output_directory -f json
|
||||
--input or -i pdf file location (required)
|
||||
--output or -o output directory (optional - default "out")
|
||||
--debug or -d show debug information (optional - default "false")
|
||||
--format or -f format (optional - default "text") - ("text,json,xml")
|
||||
--font or -t extract fonts as ttf files
|
||||
--format or -f format (optional - default "text") - ("text,json,xml,html")
|
||||
--font or -t extract fonts as ttf files (optional)
|
||||
--help or -h display this help message
|
||||
```
|
||||
|
||||
@ -52,12 +52,12 @@ and see results in ```out``` directory
|
||||
- text ```-f text (default)```
|
||||
- json ```-f json```
|
||||
- xml ```-f xml```
|
||||
- html ```-f html```
|
||||
- specify output directory
|
||||
|
||||
## TODO:
|
||||
- load pdf from remote location
|
||||
- from url
|
||||
- output to html format
|
||||
- output to markdown format
|
||||
- output to zip
|
||||
- extract tables
|
||||
|
2
gd.js
2
gd.js
@ -3,7 +3,7 @@ const GoldDigger = require('./src/GoldDigger');
|
||||
const FileManager = require('./src/pdf/FileManager');
|
||||
|
||||
|
||||
const supportedFormat = ['text', 'json', 'xml'];
|
||||
const supportedFormat = ['text', 'json', 'xml', 'html'];
|
||||
const ERR_INVALID_FORMAT = `
|
||||
Invalid output
|
||||
Please specify one of those values : "${supportedFormat}"
|
||||
|
@ -11,12 +11,13 @@ class Formatter {
|
||||
constructor (config) {
|
||||
this.debug = config.debug;
|
||||
/**
|
||||
* @type {{json: FormatterJSON, xml: FormatterXML, text: FormatterText}}
|
||||
* @type {{json: FormatterJSON, xml: FormatterXML, text: FormatterText, html: FormatterHTML}}
|
||||
*/
|
||||
this.formatters = {
|
||||
json: new f.FormatterJSON(),
|
||||
xml: new f.FormatterXML(),
|
||||
text: new f.FormatterText(),
|
||||
html: new f.FormatterHTML(),
|
||||
};
|
||||
this.data = '';
|
||||
}
|
||||
|
114
src/pdf/formatters/FormatterHTML.js
Normal file
114
src/pdf/formatters/FormatterHTML.js
Normal file
@ -0,0 +1,114 @@
|
||||
const Model = require('../model');
|
||||
|
||||
/**
 * Replacement entities for the characters that are significant in HTML text
 * and in double/single quoted attribute values.
 */
const HTML_ESCAPES = Object.freeze({
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;',
});

/**
 * Format PDF into html data.
 *
 * Every value taken from the PDF (text, font attributes, image names) is
 * HTML-escaped before it is written, because PDF content is arbitrary input
 * and would otherwise be able to break or inject markup.
 */
class FormatterHTML {
  /**
   * Escape a value so it is safe inside HTML text and quoted attributes.
   * @param {*} value - converted with String() before escaping
   * @returns {string}
   */
  static escapeHtml (value) {
    return String(value).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);
  }

  /**
   * See {@link Formatter}
   * Emits the document prologue up to and including the opening body tag.
   * @param doc - not used by this formatter
   * @param metadata - not used by this formatter
   * @returns {string}
   */
  start (doc, metadata) {
    return [
      '<!DOCTYPE html>',
      '<html lang="en">',
      '<head>',
      '<meta charset="UTF-8">',
      '<title>Title</title>',
      '</head>',
      '<body>',
      '',
    ].join('\n');
  }

  /**
   * Formats {@link TextObject} to an html string: one div containing its
   * lines ordered by descending y.
   * @param {TextObject} textObject
   * @returns {string}
   */
  formatTextObject (textObject) {
    // Sort a copy so the model owned by the caller is not reordered.
    const lines = [...textObject.getData()].sort((a, b) => {
      if (a.y > b.y) return -1;
      if (a.y < b.y) return 1;
      return 0;
    });
    let txtObjOut = '<div>\n';
    lines.forEach(textLine => {
      txtObjOut += this.formatTextLine(textLine);
    });
    txtObjOut += '</div>\n';
    return txtObjOut;
  }

  /**
   * Format image object as an img tag pointing into the "img/" directory.
   * @param {ImageObject} imageObject
   * @returns {string}
   */
  formatImageObject (imageObject) {
    const esc = FormatterHTML.escapeHtml;
    // width/height attributes take bare integers, a "px" suffix is invalid.
    return [
      '<img class="pdf-dig-img"',
      `width="${esc(imageObject.width)}"`,
      `height="${esc(imageObject.height)}"`,
      `src="img/${esc(imageObject.name)}"/>`,
      '',
    ].join('\n');
  }

  /**
   * Formats {@link TextLine} to an html string: one paragraph containing a
   * span per font run.
   * @param {TextLine} textLine
   * @returns {string}
   */
  formatTextLine (textLine) {
    let txtLineOut = '<p class="pdfdig-text-line">';
    textLine.getText().forEach(textFont => {
      txtLineOut += this.formatTextFont(textFont);
    });
    txtLineOut += '</p>\n';
    return txtLineOut;
  }

  /**
   * Formats {@link TextFont} to an html string: a span styled with the font
   * family, size (pt) and weight, wrapping the escaped text.
   * @param {TextFont} textFont
   * @returns {string}
   */
  formatTextFont (textFont) {
    const esc = FormatterHTML.escapeHtml;
    const { family, size, weight } = textFont.font;
    return [
      `<span class="pdfdig-text-font" style="font-family: ${esc(family)};`,
      `font-size:${esc(size)}pt;`,
      `font-weight:${esc(weight)};">`,
      esc(textFont.getText()),
      '</span>',
    ].join('\n');
  }

  /**
   * See {@link Formatter}
   * Wraps all objects of one page in a single div; objects that are neither
   * text nor image are reported with console.warn and skipped.
   * @param page - not used by this formatter
   * @param data - iterable (forEach) of page objects
   * @param last - not used by this formatter
   * @returns {string}
   */
  format (page, data, last) {
    let out = '<div class="pdfdig-pdf-object">\n';
    data.forEach(pdfObject => {
      if (pdfObject instanceof Model.TextObject) {
        out += this.formatTextObject(pdfObject);
      } else if (pdfObject instanceof Model.ImageObject) {
        out += this.formatImageObject(pdfObject);
      } else {
        console.warn(`Not recognised object ${pdfObject}`);
      }
    });
    out += '</div>';
    return out;
  }

  /**
   * See {@link Formatter}
   * Emits the closing body and html tags.
   * @returns {string}
   */
  end () {
    return '</body>\n</html>';
  }
}
|
||||
|
||||
module.exports = FormatterHTML;
|
@ -1,9 +1,11 @@
|
||||
const FormatterJSON = require('./FormatterJSON');
|
||||
const FormatterText = require('./FormatterText');
|
||||
const FormatterXML = require('./FormatterXML');
|
||||
const FormatterHTML = require('./FormatterHTML');
|
||||
|
||||
module.exports = {
|
||||
FormatterXML,
|
||||
FormatterJSON,
|
||||
FormatterText,
|
||||
FormatterHTML,
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user