Rewrite text extraction - calculate new line Closes #11

This commit is contained in:
Michal Szczepanski 2019-07-25 23:00:04 +02:00
parent f5441748bf
commit 877c14a6e1
4 changed files with 47 additions and 119 deletions

@ -13,25 +13,29 @@ class ExtractText {
*/
showText(glyphs, page) {
// MOVED from VisitorText
const el = page.currentObject.getLine();
let lineList = page.currentObject.getLine();
// -i ../../github.com/pdf.js/test/pdfs/ZapfDingbats.pdf -f text null pointer
if(!el.getText()) {
el.newText();
const line = new Model.TextFont();
line.font = page.currentFont;
// copy from previous line
const lastLine = lineList.getLastFontText()
if(lastLine) {
line.wordSpacing = lastLine.wordSpacing;
line.charSpacing = lastLine.charSpacing;
}
el.setFont(page.currentFont)
const line = el.getText();
let startX = page.x;
let startY = page.y;
// END
let partial = "";
let x = 0;
const font = line.getFont();
for(const glyph of glyphs) {
if (glyph === null) {
// Word break
x += font.direction * line.wordSpacing;
continue;
} else if (util.isNum(glyph)) {
x += -glyph * font.size * 0.001;
if (glyph === -250) {
x += -glyph * line.font.size * 0.001;
if (glyph <= -150) {
partial += " ";
}
continue;
@ -44,25 +48,30 @@ class ExtractText {
partial += glyph.unicode;
const width = glyph.width;
// const widthAdvanceScale = font.size * line.fontMatrix[0];
const widthAdvanceScale = font.size * Constraints.FONT_IDENTITY_MATRIX[0];
const charWidth = width * widthAdvanceScale + spacing * font.direction;
if (!glyph.isInFont && !font.missingFile) {
const widthAdvanceScale = line.font.size * Constraints.FONT_IDENTITY_MATRIX[0];
const charWidth = width * widthAdvanceScale + spacing * line.font.direction;
if (!glyph.isInFont && !line.font.missingFile) {
x += charWidth;
continue;
}
//need global x/y position
/*current.xcoords.push(current.x + x * textHScale);
current.tspan.textContent += character;
*/
page.currentObject.x += charWidth;
if (font.vertical) {
page.currentObject.y -= x * textHScale;
line.x = page.x += charWidth;
if (line.font.vertical) {
page.y -= x * page.textHScale;
} else {
page.currentObject.x += x * textHScale;
page.x += x * page.textHScale;
}
}
line.x = page.x;
line.y = page.y;
line.setText(partial+" ");
el.printText();
const isNew = lineList.y !== 0 && Math.abs(line.y - lineList.y) > line.font.size
if(isNew) {
lineList.printText()
lineList = page.currentObject.newLine()
}
lineList.x = startX;
lineList.y = startY;
lineList.addTextFont(line);
}
/**

@ -6,28 +6,12 @@ const PdfObject = require('./../PdfObject');
class TextFont extends PdfObject {
constructor() {
// Runs the PdfObject constructor first, then sets this object's own fields.
super();
// NOTE(review): both `_font` and `font` are initialized to null here. This
// listing looks like a rendered diff, so one of the two may be a pre-change
// line — confirm which field the rest of the class actually reads.
this._font = null;
this.font = null;
// Text content starts out as an empty string.
this._text = "";
// Both spacing values start at 0.
this.charSpacing = 0;
this.wordSpacing = 0;
}
/**
* Getter for {FontObject}
* @returns {null|FontObject}
*/
getFont() {
// Returns whatever is currently stored in `_font`, unchanged (null until a font is set).
return this._font;
}
/**
* Setter for {FontObject}
* @param font {FontObject}
*/
setFont(font) {
// Stores the given reference in `_font` as is; no copy and no validation.
this._font = font;
}
/**
* Setter for text string
* @param text {string}

@ -8,82 +8,28 @@ class TextLine extends PdfObject {
constructor() {
// Runs the PdfObject constructor first, then sets this object's own fields.
super();
// TextFont entries belonging to this line; starts empty.
this._textFonts = [];
// Starts at 0; setFont(font) raises it to the largest font.size it has been given.
this.maxFontSize = 0;
// NOTE(review): `w`/`h` and `width`/`height` are all initialized to 0 here.
// This listing looks like a rendered diff, so one pair may be pre-change
// lines — confirm which pair is actually in use.
this.w = 0;
this.h = 0;
this.textMatrix = null;
this.width = 0;
this.height = 0;
}
/**
* Calculate TextLine size
* @param x
* @param y
* Adds line with font to text
* @param line @see {TextFont}
*/
setBBox(x, y) {
if(this.x === 0) {
this.x = x;
}
this.w = Math.max(this.x, x);
if(this.y === 0) {
this.y = y;
}
this.h = Math.max(this.y, y);
addTextFont(line) {
// Appends `line` to the end of the internal _textFonts list; nothing is validated or returned.
this._textFonts.push(line);
}
/**
* Get last TextFont
* @returns {TextFont}
*/
getText() {
return this._textFonts[this._textFonts.length - 1]
getLastFontText() {
if(this._textFonts.length > 0) {
return this._textFonts[this._textFonts.length - 1]
}
return null
}
/**
* Create new TextFont and add it to list
* @returns {TextFont}
*/
newText() {
const t = new TextFont();
this._textFonts.push(t);
return t;
}
/**
* Remove last TextFont
*/
popText() {
// Drops the last entry of _textFonts; the removed value is discarded.
// On an empty array pop() simply does nothing.
this._textFonts.pop();
}
/**
* Checks whether the absolute value of y is greater than the largest font size added to this line
* (and this line's own y is non-zero); if so, returns true, meaning a new line of text starts
* @param y - position in pdf document
* @returns {boolean}
*/
isNewLine(y) {
return this.y !== 0 && Math.abs(y) > this.maxFontSize
}
/**
* Set FontObject information in TextFont
* @param font {FontObject} data
*/
setFont(font) {
// Keep maxFontSize at the largest font.size passed in so far.
this.maxFontSize = Math.max(this.maxFontSize, font.size);
// Forward the font to the TextFont returned by getText().
// NOTE(review): if getText() returns undefined (no TextFont in the list yet)
// this call throws — presumably callers create a TextFont first; confirm.
this.getText().setFont(font);
}
/**
* Compares two FontObject instances for equality
* @param font1 {FontObject}
* @param font2 {FontObject}
* @returns {Boolean} true if equal
*/
compareFont(font1, font2) {
// Pure delegation: the result is whatever font1.equals(font2) returns; `this` is not used.
return font1.equals(font2);
}
/**
* Output line text to the console
*/

@ -77,24 +77,13 @@ class VisitorText extends VisitorBase {
moveText(args) {
// Reads args[0] and args[1] and stores them as the x / y of page.currentObject.
// Emits a trace line when config.debug is truthy.
if (this.config.debug) console.log('moveText');
// Does nothing further when config.skip is truthy.
if (this.config.skip) return;
// NOTE(review): everything between the comment delimiters below is inert as
// shown. This listing looks like a rendered diff, so the delimiters and some of
// the enclosed lines may belong to different revisions — confirm against
// the actual file before deleting or reviving any of it.
/*
let el = this.page.currentObject.getLine();
const x = args[0], y = args[1];
const newLine = el.isNewLine(y);
// new line
if(newLine) {
el.printText();
el = this.page.currentObject.newLine();
if(this.page.x === 0 && this.page.y === 0) {
this.page.x = args[0];
this.page.y = args[1];
} else {
this.page.x = args[0]
this.page.y += args[1]
}
// create new text element always after new line
const el2 = el.newText();
el2.x = this.page.currentObject.x += x;
el2.y = this.page.currentObject.y += y;
// assign to calculate bounding box
el.setBBox(this.page.currentObject.x, this.page.currentObject.y);
*/
// Overwrites (does not accumulate) the current object's position with the two operands.
this.page.currentObject.x = args[0];
this.page.currentObject.y = args[1];
}
/**