scantools  1.0.7
Graphics manipulation with a view towards scanned documents
PDFAWriter.h
1 /*
2  * Copyright © 2016 - 2020 Stefan Kebekus <stefan.kebekus@math.uni-freiburg.de>
3  *
4  * This program is free software: you can redistribute it and/or modify it under
5  * the terms of the GNU General Public License as published by the Free Software
6  * Foundation, either version 3 of the License, or (at your option) any later
7  * version.
8  *
9  * This program is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 
19 #ifndef PDFDOCUMENT
20 #define PDFDOCUMENT 1
21 
22 #include <QFuture>
23 #include <QList>
24 #include <QReadWriteLock>
25 #include <QString>
26 
27 #include "HOCRDocument.h"
28 #include "JBIG2Document.h"
29 #include "paperSize.h"
30 #include "resolution.h"
31 
32 
127 class PDFAWriter : public QObject
128 {
129  Q_OBJECT
130  Q_PROPERTY(QString author READ author WRITE setAuthor NOTIFY authorChanged)
131  Q_PROPERTY(QString keywords READ keywords WRITE setKeywords NOTIFY keywordsChanged)
132  Q_PROPERTY(QString subject READ subject WRITE setSubject NOTIFY subjectChanged)
133  Q_PROPERTY(QString title READ title WRITE setTitle NOTIFY titleChanged)
134  Q_PROPERTY(paperSize pageSize READ pageSize WRITE setPageSize NOTIFY pageSizeChanged)
135  Q_PROPERTY(resolution resolutionOverrideHorizontal READ resolutionOverrideHorizontal WRITE setResolutionOverrideHorizontal NOTIFY resolutionOverrideHorizontalChanged)
136  Q_PROPERTY(resolution resolutionOverrideVertical READ resolutionOverrideVertical WRITE setResolutionOverrideVertical NOTIFY resolutionOverrideVerticalChanged)
137  Q_PROPERTY(bool autoOCR READ autoOCR WRITE setAutoOCR NOTIFY autoOCRChanged)
138  Q_PROPERTY(QStringList autoOCRLanguages READ autoOCRLanguages WRITE setAutoOCRLanguages NOTIFY autoOCRLanguagesChanged)
139 
140  public:
147 
170  explicit PDFAWriter(bool bestCompression=false, QObject* parent=nullptr);
171 
175  QString author();
176 
181  void setAuthor(const QString &author);
182 
186  QString keywords();
187 
192  void setKeywords(const QString &keywords);
193 
198  QString subject();
199 
204  void setSubject(const QString &subject);
205 
209  QString title();
210 
215  void setTitle(const QString &title);
216 
222 
227  void setPageSize(const paperSize size);
228 
234 
240 
248 
254 
262 
276  void setResolutionOverride(resolution horizontal, resolution vertical);
277 
283  {
284  setResolutionOverride(res, res);
285  }
286 
289  {
291  }
292 
296  bool autoOCR();
297 
307  void setAutoOCR(bool autoOCR);
308 
313  QStringList autoOCRLanguages();
314 
332  QString setAutoOCRLanguages(const QStringList& nOCRLanguages);
333 
349  void appendToOCRData(const HOCRDocument &doc);
350 
358 
363  void clearOCRData();
364 
398  QString addPages(const QImage &image, QStringList *warnings=0);
399 
419  QString addPages(const JBIG2Document &jbig2doc, QStringList *warnings=0);
420 
489  QString addPages(const QString &imageFileName, QStringList *warnings=0);
490 
502  operator QByteArray();
503 
504  public slots:
517 
518  signals:
521 
524 
527 
529  void titleChanged();
530 
533 
536 
539 
542 
545 
553  void finished();
554 
567  void progress(qreal percentage);
568 
569  private:
570  // Meta data
571  QString _author, _keywords, _subject, _title;
572 
573  // Paper size
574  paperSize _pageSize;
575 
576  // HOCR Document
577  HOCRDocument userSpecifiedOCRData;
578  QStringList OCRLanguages;
579  bool _autoOCR;
580 
581  // Override resolutions
582  resolution horizontalResolutionOverride;
583  resolution verticalResolutionOverride;
584 
585 // This private method adds a JBIG2 image to the PDF document. It differs from
586 // the generic method addPages() only in its arguments: it expects the name of
587 // a JBIG2 file instead of an arbitrary graphics file.
588  //
589  // The image will be embedded in the PDF without re-encoding. The method does
590  // not check in detail if the file complies with the JBIG2 standard. If
591  // invalid input data is fed into this method, then the resulting PDF file
592  // might possibly not comply to the PDF/A standard.
593  QString addJBIG2(const QString &fileName, QStringList *warnings=0);
594 
595 // This private method adds a JPEG image to the PDF document. It differs from
596 // the generic method addPages() only in its arguments: it expects the name of
597 // a JPEG file instead of an arbitrary graphics file.
598  //
599  // The image will be embedded in the PDF without re-encoding. The method does
600  // not check in detail if the file complies with the JPEG standard. If
601  // invalid input data is fed into this method, then the resulting PDF file
602  // might possibly not comply to the PDF/A standard.
603  QString addJPEG(const QString &fileName);
604 
605  // This private method adds a JPEG2000 (ISO/IEC 15444-2) image to the PDF
606  // document. The method expects a JPX or JPF file, and NOT a JP2 file. It
607  // differs from the generic method addPages() only in the arguments. It
608 // expects the name of a JPEG2000 file instead of an arbitrary graphics file.
609  //
610  // The image will be embedded in the PDF without re-encoding. The method does
611 // not check in detail if the file complies with the JPEG2000 standard. If
612  // invalid input data is fed into this method, then the resulting PDF file
613  // might possibly not comply to the PDF/A standard.
614  QString addJPX(const QString &fileName);
615 
616  // This private method adds a TIFF image to the PDF document. The method
617  // exists because QImageReader cannot handle multi-page TIFF files. The method
618  // reads all images contained in the file, and calls addImage() to add them to
619  // the PDF
620  QString addTIFF(const QString &fileName);
621 
622  // This private method is used internally to generate a page containing a
623  // given graphicObject, and optionally a text overlay. This method assumes
624  // that the arguments have been checked and are correct. It also assumes that
625  // the PDFAWriter has been locked for writing.
626  void addGFXPage(quint32 graphicObjectIndex, const imageInfo& bInfo, const QImage& imageForOCR = QImage());
627 
628  // Lock used to provide thread-safety
629  QReadWriteLock lock;
630 
631  // PDF protoObject. This is either a QByteArray or QFuture<QByteArray>.
632 class protoObject { // Holds one PDF object either as ready bytes ('data') or as a pending QFuture ('future')
633 public:
634 // cppcheck-suppress noExplicitConstructor
635 protoObject(QByteArray _data) : data(_data) { // Implicit conversion from ready bytes; 'future' stays default-constructed
636 ;
637 };
638 
639 // cppcheck-suppress noExplicitConstructor
640 protoObject(QFuture<QByteArray> _future) : future(_future) { // Implicit conversion from a pending computation; 'data' stays empty for now
641 ;
642 };
643 // Conversion to the object's bytes. Non-const: it may cache the future's result in 'data' and reset 'future'.
644 inline operator QByteArray() {
645 if (!future.isCanceled()) { // NOTE(review): relies on a default-constructed QFuture reporting isCanceled() (per Qt docs) — confirm for the Qt version in use
646 data = future.result(); // Per Qt docs, result() waits until the result is available
647 future = QFuture<QByteArray>(); // Drop the future so later conversions return the cached 'data' directly
648 }
649 return data; // Either the bytes given at construction, the cached result, or empty if the future was canceled
650 };
651 
652 QString description; // Not set by either constructor; presumably a human-readable label for the object — verify against the implementation
653 QByteArray data; // Bytes of the object; valid once constructed from bytes or after the first conversion
654 QFuture<QByteArray> future; // Pending computation of the bytes; replaced by a default-constructed future after the first conversion
655 };
656 
657  // List of PDF objects
658  QList<protoObject> objects;
659 
660  // Index of the PDF object in the 'objects' list that contains …
661  quint32 catalogObjectIndex; // … the catalog of the PDF file
662  quint32 metaDataObjectIndex; // … the meta data
663  quint32 infoObjectIndex; // … the info object
664  quint32 pageDirectoryObjectIndex; // … the page directory
665  quint32 colorProfileObjectIndex; // … the color profile
666  quint32 fontObjectIndex; // … the font object itself
667 
668  // Use zopfli compression for bitmap graphics
669  bool bestCompression;
670 
671  // Indices of the PDF page objects in the 'objects' list
672  QList<quint32> pageIndices;
673 
674  // Reads file content into QByteArray
675  static QByteArray readFile(const QString& fileName);
676 
677  // Constructs a page directory object
678  QByteArray generatePageDirectoryObject() const;
679 
680 // Takes data from input, checks if zlib compression actually shrinks the
681  // data, and then generates a stream object, either unencoded or zlib encoded.
682  static QByteArray generateStreamObject(const QByteArray &input);
683 
684  // Returns the index of a font object for Times-Roman. Creates the object, if necessary
685  quint32 getFontObjectIndex();
686 
687  // Assumes that the image is black-and-white, as returned by
688  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
689  // PDF object containing the FAX G4 compressed image.
690  static QByteArray createImageObject_bw_G4(const QImage &image);
691 
692  // Assumes that the image is bitonal, as returned by
693  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
694  // PDF object containing the FAX G4 compressed image.
695  static QByteArray createImageObject_bitonal_G4(const QImage &image);
696 
697  // Assumes that the image is grayscale, as returned by
698  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
699  // PDF object containing the zlib/zopfli compressed image.
700  static QByteArray createImageObject_gray_zlib(const QImage &image, bool bestCompression);
701 
702  // Assumes that the image has an indexed palette, as returned by
703  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
704  // PDF object containing the zlib/zopfli compressed image.
705  static QByteArray createImageObject_indexed_zlib(const QImage &image, bool bestCompression);
706 
707  // Assumes that the image is full color, as returned by
708  // imageOperations::optimizedFormat(), and returns a QByteArray containing a
709  // PDF object containing the zlib/zopfli compressed image.
710  static QByteArray createImageObject_rgb_zlib(const QImage &image, bool bestCompression);
711 
712  // Internal method. The method takes a page content stream and generates a
713  // well-compressed pageContent object, using the textBox to create a text
714  // overlay.
715  static QByteArray completePageContentObject_a(QByteArray contentStream, const imageInfo& bInfo, length deltaX, length deltaY, const HOCRTextBox& textBox);
716 
717 // Internal method. The method runs the tesseract OCR engine to create an
718 // HOCRTextBox and then calls completePageContentObject_a.
719  static QByteArray completePageContentObject_b(QByteArray contentStream, const imageInfo& bInfo, length deltaX, length deltaY, const QImage& image, const QStringList& OCRLanguages);
720 };
721 
722 #endif
PDFAWriter::setResolutionOverrideVertical
void setResolutionOverrideVertical(resolution vertical)
Set vertical resolution.
PDFAWriter::subjectChanged
void subjectChanged()
Emitted when subject changes.
PDFAWriter::addPages
QString addPages(const QString &imageFileName, QStringList *warnings=0)
Add images to the PDF document.
imageInfo
Trivial class to store elementary info about bitmap graphics.
Definition: imageInfo.h:31
PDFAWriter::keywords
QString keywords()
Metadata: Keywords.
PDFAWriter
Simple generator for PDF/A-2b compliant documents.
Definition: PDFAWriter.h:128
PDFAWriter::autoOCRLanguages
QStringList autoOCRLanguages()
List of languages used for OCR.
paperSize::format
format
List of supported standard sizes.
Definition: paperSize.h:35
length
The length stores a length and converts between units.
Definition: length.h:38
PDFAWriter::setPageSize
void setPageSize(paperSize::format size=paperSize::empty)
Sets page size, effective for future calls of the addPages() methods.
PDFAWriter::pageSizeChanged
void pageSizeChanged()
Emitted when pageSize changes.
PDFAWriter::setAutoOCRLanguages
QString setAutoOCRLanguages(const QStringList &nOCRLanguages)
Specify languages used by the tesseract OCR engine.
PDFAWriter::title
QString title()
Metadata: Title String.
PDFAWriter::PDFAWriter
PDFAWriter(bool bestCompression=false, QObject *parent=nullptr)
Constructor.
PDFAWriter::addPages
QString addPages(const QImage &image, QStringList *warnings=0)
Add an image to the PDF document.
PDFAWriter::OCRData
HOCRDocument OCRData()
Return a copy of the internal HOCRDocument.
PDFAWriter::addPages
QString addPages(const JBIG2Document &jbig2doc, QStringList *warnings=0)
Add JBIG2 images to the PDF document.
PDFAWriter::subject
QString subject()
Metadata: Subject string.
PDFAWriter::resolutionOverrideVertical
resolution resolutionOverrideVertical()
Vertical resolution.
PDFAWriter::setAuthor
void setAuthor(const QString &author)
Set the author string in the PDF/A meta data.
PDFAWriter::setAutoOCR
void setAutoOCR(bool autoOCR)
Specify if the tesseract OCR engine should be run automatically.
PDFAWriter::finished
void finished()
Emitted just before waitForWorkerThreads() returns.
PDFAWriter::autoOCRChanged
void autoOCRChanged()
Emitted when autoOCR changes.
PDFAWriter::appendToOCRData
void appendToOCRData(const HOCRDocument &doc)
Specify pre-processed OCR data.
HOCRTextBox
Text box, as defined in an HOCR file.
Definition: HOCRTextBox.h:45
PDFAWriter::resolutionOverrideHorizontalChanged
void resolutionOverrideHorizontalChanged()
Emitted when resolutionOverrideHorizontal changes.
PDFAWriter::setPageSize
void setPageSize(const paperSize size)
Sets page size, effective for future calls of the addPages() methods.
PDFAWriter::keywordsChanged
void keywordsChanged()
Emitted when keywords change.
paperSize
The paperSize class identifies and stores paper sizes.
Definition: paperSize.h:32
PDFAWriter::setResolutionOverride
void setResolutionOverride(resolution horizontal, resolution vertical)
Sets graphic resolution for future calls of the addPages() methods.
PDFAWriter::progress
void progress(qreal percentage)
Progress indicator.
resolution
The resolution class stores a resolution and converts between units.
Definition: resolution.h:40
PDFAWriter::~PDFAWriter
~PDFAWriter()
Destructor.
PDFAWriter::setSubject
void setSubject(const QString &subject)
Set the subject string in the PDF/A meta data.
PDFAWriter::setResolutionOverride
void setResolutionOverride(resolution res)
Overloaded method that sets horizontal and vertical resolution to the same value.
Definition: PDFAWriter.h:282
PDFAWriter::autoOCRLanguagesChanged
void autoOCRLanguagesChanged()
Emitted when autoOCRLanguages change.
PDFAWriter::resolutionOverrideVerticalChanged
void resolutionOverrideVerticalChanged()
Emitted when resolutionOverrideVertical changes.
PDFAWriter::titleChanged
void titleChanged()
Emitted when title changes.
PDFAWriter::setKeywords
void setKeywords(const QString &keywords)
Set the keywords string in the PDF/A meta data.
JBIG2Document
Reads, writes and renders JBIG2 files, and chops them into pieces for inclusion into a PDF document.
Definition: JBIG2Document.h:40
PDFAWriter::authorChanged
void authorChanged()
Emitted when author changes.
PDFAWriter::author
QString author()
Metadata: Author.
PDFAWriter::setResolutionOverrideHorizontal
void setResolutionOverrideHorizontal(resolution horizontal)
Set horizontal resolution.
HOCRDocument
Reads and interprets HOCR files, the standard output file format for Optical Character Recognition sy...
Definition: HOCRDocument.h:42
PDFAWriter::waitForWorkerThreads
void waitForWorkerThreads()
Waits for all worker threads to finish.
PDFAWriter::resolutionOverrideHorizontal
resolution resolutionOverrideHorizontal()
Horizontal resolution.
paperSize::empty
@ empty
0x0mm
Definition: paperSize.h:38
PDFAWriter::pageSize
paperSize pageSize()
Page Size.
PDFAWriter::clearOCRData
void clearOCRData()
Delete all pages from the internal HOCRDocument.
PDFAWriter::clearResolutionOverride
void clearResolutionOverride()
Set horizontal and vertical override resolution to zero.
Definition: PDFAWriter.h:288
PDFAWriter::setTitle
void setTitle(const QString &title)
Set the title string in the PDF/A meta data.
PDFAWriter::autoOCR
bool autoOCR()
AutoOCR.