BibleTime
gbftohtml.cpp
Go to the documentation of this file.
1 /*********
2 *
3 * In the name of the Father, and of the Son, and of the Holy Spirit.
4 *
5 * This file is part of BibleTime's source code, https://bibletime.info/
6 *
7 * Copyright 1999-2021 by the BibleTime developers.
8 * The BibleTime source code is licensed under the GNU General Public License
9 * version 2.0.
10 *
11 **********/
12 
13 #include "gbftohtml.h"
14 
15 #include <cstdlib>
16 #include <cstring>
17 #include <QByteArray>
18 #include <QChar>
19 #include <QRegularExpression>
20 #include <QRegularExpressionMatch>
21 #include <QString>
22 #include <QStringList>
23 #include <utility>
24 #include "../../util/btassert.h"
25 #include "../drivers/cswordmoduleinfo.h"
26 #include "../managers/cswordbackend.h"
27 
28 // Sword includes:
29 #include <gbfhtml.h>
30 #include <swbasicfilter.h>
31 #include <swbuf.h>
32 #include <swkey.h>
33 #include <swmodule.h>
34 
35 
37 
38  setEscapeStringCaseSensitive(true);
39  setPassThruUnknownEscapeString(true); //the HTML widget will render the HTML escape codes
40 
41  removeTokenSubstitute("Rf");
42  // addTokenSubstitute("RB", "<span>"); //start of a footnote with embedded text
43 
44  addTokenSubstitute("FI", "<span class=\"italic\">"); // italics begin
45  addTokenSubstitute("Fi", "</span>");
46 
47  addTokenSubstitute("FB", "<span class=\"bold\">"); // bold begin
48  addTokenSubstitute("Fb", "</span>");
49 
50  addTokenSubstitute("FR", "<span class=\"jesuswords\">");
51  addTokenSubstitute("Fr", "</span>");
52 
53  addTokenSubstitute("FU", "<u>"); // underline begin
54  addTokenSubstitute("Fu", "</u>");
55 
56  addTokenSubstitute("FO", "<span class=\"quotation\">"); // Old Testament quote begin
57  addTokenSubstitute("Fo", "</span>");
58 
59 
60  addTokenSubstitute("FS", "<span class=\"sup\">"); // Superscript begin
61  addTokenSubstitute("Fs", "</span>");
62 
63  addTokenSubstitute("FV", "<span class=\"sub\">"); // Subscript begin
64  addTokenSubstitute("Fv", "</span>");
65 
66  addTokenSubstitute("TT", "<div class=\"booktitle\">");
67  addTokenSubstitute("Tt", "</div>");
68 
69  addTokenSubstitute("TS", "<div class=\"sectiontitle\">");
70  addTokenSubstitute("Ts", "</div>");
71 
72  //addTokenSubstitute("PP", "<span class=\"poetry\">"); // poetry begin
73  //addTokenSubstitute("Pp", "</span>");
74 
75 
76  addTokenSubstitute("Fn", "</font>"); // font end
77  addTokenSubstitute("CL", "<br/>"); // new line
78  addTokenSubstitute("CM", "<br/>"); // paragraph break, rendered as a line break
79 
80  addTokenSubstitute("CG", "&gt;"); // literal greater-than sign
81  addTokenSubstitute("CT", "&lt;"); // literal less-than sign
82 
83  addTokenSubstitute("JR", "<span class=\"right\">"); // right align begin
84  addTokenSubstitute("JC", "<span class=\"center\">"); // center align begin
85  addTokenSubstitute("JL", "</span>"); // align end
86 }
87 
88 /** No descriptions */
89 char Filters::GbfToHtml::processText(sword::SWBuf& buf, const sword::SWKey * key, const sword::SWModule * module) {
90  GBFHTML::processText(buf, key, module);
91 
92  if (!module->isProcessEntryAttributes()) {
93  return 1; //no processing should be done, may happen in a search
94  }
95 
96  if (auto * const m =
97  CSwordBackend::instance().findModuleByName(module->getName()))
98  {
99  // only parse if the module has strongs or lemmas:
100  if (!m->has(CSwordModuleInfo::lemmas)
101  && !m->has(CSwordModuleInfo::morphTags)
103  return 1; //WARNING: Return already here
104  }
105 
106  //Am Anfang<WH07225> schuf<WH01254><WTH8804> Gott<WH0430> Himmel<WH08064> und<WT> Erde<WH0776>.
107  //A simple word<WT> means: No entry for this word "word"
108 
109 
110  //split the text into parts which end with the GBF tag marker for strongs/lemmas
111  QStringList list;
112  {
113  auto t = QString::fromUtf8(buf.c_str());
114  {
115  static QRegularExpression const tag(
116  QStringLiteral(R"PCRE(([.,;:]?<W[HGT][^>]*?>\s*)+)PCRE"));
117 
118  QRegularExpressionMatch match;
119  auto pos = t.indexOf(tag, 0, &match);
120  if (pos == -1) //no strong or morph code found in this text
121  return 1; //WARNING: Return already here
122  do {
123  auto const partLength = pos + match.capturedLength();
124  list.append(t.left(partLength));
125  t.remove(0, partLength);
126  pos = t.indexOf(tag, 0, &match);
127  } while (pos != -1);
128  }
129 
130  //append the trailing text to the list.
131  if (!t.isEmpty())
132  list.append(std::move(t));
133  }
134 
135  //list is now a list of words with 1-n Strongs at the end, which belong to this word.
136 
137  //now create the necessary HTML in list entries and concat them to the result
138  static QRegularExpression const tag(
139  QStringLiteral(R"PCRE(<W([HGT])([^>]*?)>)PCRE"));
140 
141  QString result;
142  for (auto & e : list) { // for each entry to process
143  //qWarning(e.latin1());
144 
145  //check if there is a word to which the strongs info belongs to.
146  //If yes, wrap that word with the strongs info
147  //If not, leave out the strongs info, because it can't be tight to a text
148  //Comparing the first char with < is not enough, because the tokenReplace is done already
149  //so there might be html tags already.
150  {
151  static QRegularExpression const re(
152  QStringLiteral(R"PCRE([.,;:])PCRE"));
153  if (e.trimmed().remove(re).left(2) == QStringLiteral("<W")) {
154  result += e;
155  continue;
156  }
157  }
158 
159  bool insertedTag = false;
160  bool hasLemmaAttr = false;
161  bool hasMorphAttr = false;
162 
163  int tagAttributeStart = -1;
164 
165  /* Try to find a strong number marker. Work on all strong/lemma tags in
166  this section, should be between 1-3 loops: */
167  QRegularExpressionMatch match;
168  for (auto pos = e.indexOf(tag, 0, &match);
169  pos != -1;
170  pos = e.indexOf(tag, pos, &match))
171  {
172  auto const isMorph = match.captured(1) == QStringLiteral("T");
173  auto const value =
174  isMorph
175  ? match.captured(2)
176  : match.captured(2).prepend(match.captured(1));
177 
178  if (value.isEmpty()) {
179  break;
180  }
181 
182  //insert the span
183  if (!insertedTag) { //we have to insert a new tag end and beginning, i.e. our first loop
184  e.replace(pos, match.capturedLength(), QStringLiteral("</span>"));
185  pos += 7;
186 
187  //skip blanks, commas, dots and stuff at the beginning, it doesn't belong to the morph code
188  auto rep =
189  QStringLiteral("<span %1=\"%2\">")
190  .arg(isMorph
191  ? QStringLiteral("morph")
192  : QStringLiteral("lemma"),
193  value);
194 
195  hasMorphAttr = isMorph;
196  hasLemmaAttr = !isMorph;
197 
198  int startPos = 0;
199  QChar c = e[startPos];
200 
201  while ((startPos < pos) && (c.isSpace() || c.isPunct())) {
202  ++startPos;
203 
204  c = e[startPos];
205  }
206 
207  tagAttributeStart = startPos + 6; //to point to the start of the attributes
208  pos += rep.size();
209  e.insert(startPos, std::move(rep));
210  }
211  else { //add the attribute to the existing tag
212  e.remove(pos, match.capturedLength());
213 
214  if (tagAttributeStart == -1) {
215  continue; //nothing valid found
216  }
217 
218  if ((!isMorph && hasLemmaAttr) || (isMorph && hasMorphAttr)) { //we append another attribute value, e.g. 3000 gets 3000|5000
219  //search the existing attribute start
220  auto const & attrRegExp =
221  [isMorph]{
222  if (isMorph) {
223  static QRegularExpression const re(
224  QStringLiteral("morph=\".+?(?=\")"));
225  return re;
226  } else {
227  static QRegularExpression const re(
228  QStringLiteral("lemma=\".+?(?=\")"));
229  return re;
230  }
231  }();
232  QRegularExpressionMatch match;
233  const int foundPos =
234  e.indexOf(attrRegExp, tagAttributeStart, &match);
235 
236  if (foundPos != -1) {
237  e.insert(foundPos + match.capturedLength(),
238  QStringLiteral("|") + value);
239  pos += value.length() + 1;
240 
241  hasLemmaAttr = !isMorph;
242  hasMorphAttr = isMorph;
243  }
244  }
245  else { //attribute was not yet inserted
246  hasMorphAttr = isMorph;
247  hasLemmaAttr = !isMorph;
248 
249  auto attr = QStringLiteral("%1=\"%2\" ")
250  .arg(isMorph
251  ? QStringLiteral("morph")
252  : QStringLiteral("lemma"),
253  value);
254  pos += attr.size();
255  e.insert(tagAttributeStart, std::move(attr));
256  }
257 
258  //tagAttributeStart remains the same
259  }
260 
261  insertedTag = true;
262  }
263 
264  result += e;
265  }
266 
267  if (!list.isEmpty())
268  buf = result.toUtf8().constData();
269 
270  return 1;
271 }
272 
273 namespace {
274 int hexDigitValue(char const hex) {
275  switch (hex) {
276  case '0': case '1': case '2': case '3': case '4':
277  case '5': case '6': case '7': case '8': case '9':
278  return hex - '0';
279  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
280  return hex - 'a' + 10;
281  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
282  return hex - 'A' + 10;
283  default:
284  BT_ASSERT(false && "Invalid hex code in GBF");
285  abort();
286  }
287 }
288 
289 char hexToChar(char const * const hex) {
290  int const first = hexDigitValue(hex[0u]);
291  return (first * 16u) + hexDigitValue(hex[1u]);
292 }
293 }
294 
295 bool Filters::GbfToHtml::handleToken(sword::SWBuf &buf, const char *token, sword::BasicFilterUserData *userData) {
296  if (!substituteToken(buf, token)) { // More than a simple replace
297  size_t const tokenLength = std::strlen(token);
298 
299  BT_ASSERT(dynamic_cast<UserData *>(userData));
300  UserData * const myUserData = static_cast<UserData *>(userData);
301  // Hack to be able to call stuff like Lang():
302  sword::SWModule const * const myModule =
303  const_cast<sword::SWModule *>(myUserData->module);
304 
305  /* We use several append calls because appendFormatted slows down
306  filtering, which should be fast. */
307 
308  if (!std::strncmp(token, "WG", 2u)
309  || !std::strncmp(token, "WH", 2u)
310  || !std::strncmp(token, "WT", 2u))
311  {
312  buf.append('<').append(token).append('>');
313  } else if (!std::strncmp(token, "RB", 2u)) {
314  myUserData->hasFootnotePreTag = true;
315  buf.append("<span class=\"footnotepre\">");
316  } else if (!std::strncmp(token, "RF", 2u)) {
317  if (myUserData->hasFootnotePreTag) {
318  // qWarning("inserted footnotepre end");
319  buf.append("</span>");
320  myUserData->hasFootnotePreTag = false;
321  }
322 
323  buf.append(" <span class=\"footnote\" note=\"")
324  .append(myModule->getName())
325  .append('/')
326  .append(myUserData->key->getShortText())
327  .append('/')
328  .append(QString::number(myUserData->swordFootnote).toUtf8().constData())
329  .append("\">*</span> ");
330  myUserData->swordFootnote++;
331  userData->suspendTextPassThru = true;
332  } else if (!std::strncmp(token, "Rf", 2u)) { // End of footnote
333  userData->suspendTextPassThru = false;
334  } else if (!std::strncmp(token, "FN", 2u)) {
335  // The end </font> tag is inserted in addTokenSubsitute
336  buf.append("<font face=\"");
337  for (size_t i = 2u; i < tokenLength; i++)
338  if (token[i] != '\"')
339  buf.append(token[i]);
340  buf.append("\">");
341  } else if (!std::strncmp(token, "CA", 2u)) { // ASCII value <CA##> in hex
342  BT_ASSERT(tokenLength == 4u);
343  buf.append(static_cast<char>(hexToChar(token + 2u)));
344  } else {
345  return GBFHTML::handleToken(buf, token, userData);
346  }
347  }
348 
349  return true;
350 }
#define BT_ASSERT(...)
Definition: btassert.h:17
static CSwordBackend & instance() noexcept
Definition: cswordbackend.h:98
static FilterOption const strongNumbers
static FilterOption const morphTags
static FilterOption const lemmas
short unsigned int swordFootnote
Definition: gbftohtml.h:55
bool handleToken(sword::SWBuf &buf, const char *token, sword::BasicFilterUserData *userData) override
Definition: gbftohtml.cpp:295
char processText(sword::SWBuf &buf, const sword::SWKey *key, const sword::SWModule *module=nullptr) override
Definition: gbftohtml.cpp:89
int hexDigitValue(char const hex)
Definition: gbftohtml.cpp:274
char hexToChar(char const *const hex)
Definition: gbftohtml.cpp:289
result append(std::move(e))