Rewrite text extraction - calculate new line Closes #11
This commit is contained in:
parent
f5441748bf
commit
877c14a6e1
@ -13,25 +13,29 @@ class ExtractText {
|
||||
*/
|
||||
showText(glyphs, page) {
|
||||
// MOVED from VisitorText
|
||||
const el = page.currentObject.getLine();
|
||||
let lineList = page.currentObject.getLine();
|
||||
// Null pointer seen when run with: -i ../../github.com/pdf.js/test/pdfs/ZapfDingbats.pdf -f text
|
||||
if(!el.getText()) {
|
||||
el.newText();
|
||||
const line = new Model.TextFont();
|
||||
line.font = page.currentFont;
|
||||
// copy from previous line
|
||||
const lastLine = lineList.getLastFontText()
|
||||
if(lastLine) {
|
||||
line.wordSpacing = lastLine.wordSpacing;
|
||||
line.charSpacing = lastLine.charSpacing;
|
||||
}
|
||||
el.setFont(page.currentFont)
|
||||
const line = el.getText();
|
||||
let startX = page.x;
|
||||
let startY = page.y;
|
||||
// END
|
||||
let partial = "";
|
||||
let x = 0;
|
||||
const font = line.getFont();
|
||||
for(const glyph of glyphs) {
|
||||
if (glyph === null) {
|
||||
// Word break
|
||||
x += font.direction * line.wordSpacing;
|
||||
continue;
|
||||
} else if (util.isNum(glyph)) {
|
||||
x += -glyph * font.size * 0.001;
|
||||
if (glyph === -250) {
|
||||
x += -glyph * line.font.size * 0.001;
|
||||
if (glyph <= -150) {
|
||||
partial += " ";
|
||||
}
|
||||
continue;
|
||||
@ -44,25 +48,30 @@ class ExtractText {
|
||||
partial += glyph.unicode;
|
||||
const width = glyph.width;
|
||||
// const widthAdvanceScale = font.size * line.fontMatrix[0];
|
||||
const widthAdvanceScale = font.size * Constraints.FONT_IDENTITY_MATRIX[0];
|
||||
const charWidth = width * widthAdvanceScale + spacing * font.direction;
|
||||
if (!glyph.isInFont && !font.missingFile) {
|
||||
const widthAdvanceScale = line.font.size * Constraints.FONT_IDENTITY_MATRIX[0];
|
||||
const charWidth = width * widthAdvanceScale + spacing * line.font.direction;
|
||||
if (!glyph.isInFont && !line.font.missingFile) {
|
||||
x += charWidth;
|
||||
continue;
|
||||
}
|
||||
//need global x/y position
|
||||
/*current.xcoords.push(current.x + x * textHScale);
|
||||
current.tspan.textContent += character;
|
||||
*/
|
||||
page.currentObject.x += charWidth;
|
||||
if (font.vertical) {
|
||||
page.currentObject.y -= x * textHScale;
|
||||
line.x = page.x += charWidth;
|
||||
if (line.font.vertical) {
|
||||
page.y -= x * page.textHScale;
|
||||
} else {
|
||||
page.currentObject.x += x * textHScale;
|
||||
page.x += x * page.textHScale;
|
||||
}
|
||||
}
|
||||
line.x = page.x;
|
||||
line.y = page.y;
|
||||
line.setText(partial+" ");
|
||||
el.printText();
|
||||
const isNew = lineList.y !== 0 && Math.abs(line.y - lineList.y) > line.font.size
|
||||
if(isNew) {
|
||||
lineList.printText()
|
||||
lineList = page.currentObject.newLine()
|
||||
}
|
||||
lineList.x = startX;
|
||||
lineList.y = startY;
|
||||
lineList.addTextFont(line);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -6,28 +6,12 @@ const PdfObject = require('./../PdfObject');
|
||||
class TextFont extends PdfObject {
|
||||
constructor() {
|
||||
super();
|
||||
this._font = null;
|
||||
this.font = null;
|
||||
this._text = "";
|
||||
this.charSpacing = 0;
|
||||
this.wordSpacing = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Getter for {FontObject}
|
||||
* @returns {null|FontObject}
|
||||
*/
|
||||
getFont() {
|
||||
return this._font;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter for {FontObject}
|
||||
* @param font {FontObject}
|
||||
*/
|
||||
setFont(font) {
|
||||
this._font = font;
|
||||
}
|
||||
|
||||
/**
|
||||
* Setter for text string
|
||||
* @param text {string}
|
||||
|
@ -8,82 +8,28 @@ class TextLine extends PdfObject {
|
||||
constructor() {
|
||||
super();
|
||||
this._textFonts = [];
|
||||
this.maxFontSize = 0;
|
||||
this.w = 0;
|
||||
this.h = 0;
|
||||
this.textMatrix = null;
|
||||
this.width = 0;
|
||||
this.height = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate TextLine size
|
||||
* @param x
|
||||
* @param y
|
||||
* Adds line with font to text
|
||||
* @param line {TextFont} text run to append to this line
|
||||
*/
|
||||
setBBox(x, y) {
|
||||
if(this.x === 0) {
|
||||
this.x = x;
|
||||
}
|
||||
this.w = Math.max(this.x, x);
|
||||
if(this.y === 0) {
|
||||
this.y = y;
|
||||
}
|
||||
this.h = Math.max(this.y, y);
|
||||
addTextFont(line) {
|
||||
this._textFonts.push(line);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get last TextFont
|
||||
* @returns {TextFont|null} the last TextFont, or null when none has been added
|
||||
*/
|
||||
getText() {
|
||||
return this._textFonts[this._textFonts.length - 1]
|
||||
getLastFontText() {
|
||||
if(this._textFonts.length > 0) {
|
||||
return this._textFonts[this._textFonts.length - 1]
|
||||
}
|
||||
return null
|
||||
}
|
||||
|
||||
/**
|
||||
* Create new TextFont and add it to list
|
||||
* @returns {TextFont}
|
||||
*/
|
||||
newText() {
|
||||
const t = new TextFont();
|
||||
this._textFonts.push(t);
|
||||
return t;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove last TextFont
|
||||
*/
|
||||
popText() {
|
||||
this._textFonts.pop();
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the absolute value of y is greater than the maximal font size added to this line;
|
||||
* if so, returns true, meaning a new line of text begins
|
||||
* @param y - position in pdf document
|
||||
* @returns {boolean}
|
||||
*/
|
||||
isNewLine(y) {
|
||||
return this.y !== 0 && Math.abs(y) > this.maxFontSize
|
||||
}
|
||||
|
||||
/**
|
||||
* Set FontObject information in TextFont
|
||||
* @param font {FontObject} data
|
||||
*/
|
||||
setFont(font) {
|
||||
this.maxFontSize = Math.max(this.maxFontSize, font.size);
|
||||
this.getText().setFont(font);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare if two FontObject are equal
|
||||
* @param font1 {FontObject}
|
||||
* @param font2 {FontObject}
|
||||
* @returns {Boolean} true if equal
|
||||
*/
|
||||
compareFont(font1, font2) {
|
||||
return font1.equals(font2);
|
||||
}
|
||||
|
||||
/**
|
||||
* Output line text to the console
|
||||
*/
|
||||
|
@ -77,24 +77,13 @@ class VisitorText extends VisitorBase {
|
||||
moveText(args) {
|
||||
if (this.config.debug) console.log('moveText');
|
||||
if (this.config.skip) return;
|
||||
/*
|
||||
let el = this.page.currentObject.getLine();
|
||||
const x = args[0], y = args[1];
|
||||
const newLine = el.isNewLine(y);
|
||||
// new line
|
||||
if(newLine) {
|
||||
el.printText();
|
||||
el = this.page.currentObject.newLine();
|
||||
if(this.page.x === 0 && this.page.y === 0) {
|
||||
this.page.x = args[0];
|
||||
this.page.y = args[1];
|
||||
} else {
|
||||
this.page.x = args[0]
|
||||
this.page.y += args[1]
|
||||
}
|
||||
// create new text element always after new line
|
||||
const el2 = el.newText();
|
||||
el2.x = this.page.currentObject.x += x;
|
||||
el2.y = this.page.currentObject.y += y;
|
||||
// assign to calculate bounding box
|
||||
el.setBBox(this.page.currentObject.x, this.page.currentObject.y);
|
||||
*/
|
||||
this.page.currentObject.x = args[0];
|
||||
this.page.currentObject.y = args[1];
|
||||
}
|
||||
|
||||
/**
|
||||
|
Loading…
Reference in New Issue
Block a user