BibleTime
gbftohtml.cpp
Go to the documentation of this file.
1/*********
2*
3* In the name of the Father, and of the Son, and of the Holy Spirit.
4*
5* This file is part of BibleTime's source code, https://bibletime.info/
6*
7* Copyright 1999-2025 by the BibleTime developers.
8* The BibleTime source code is licensed under the GNU General Public License
9* version 2.0.
10*
11**********/
12
13#include "gbftohtml.h"
14
15#include <cstdlib>
16#include <cstring>
17#include <QByteArray>
18#include <QChar>
19#include <QRegularExpression>
20#include <QRegularExpressionMatch>
21#include <QString>
22#include <QStringList>
23#include <utility>
24#include "../../util/btassert.h"
25#include "../drivers/cswordmoduleinfo.h"
26#include "../managers/cswordbackend.h"
27
28// Sword includes:
29#include <gbfhtml.h>
30#include <swbasicfilter.h>
31#include <swbuf.h>
32#include <swkey.h>
33#include <swmodule.h>
34
35
37
38 setEscapeStringCaseSensitive(true);
39 setPassThruUnknownEscapeString(true); //the HTML widget will render the HTML escape codes
40
41 removeTokenSubstitute("Rf");
42 // addTokenSubstitute("RB", "<span>"); //start of a footnote with embedded text
43
44 addTokenSubstitute("FI", "<span class=\"italic\">"); // italics begin
45 addTokenSubstitute("Fi", "</span>");
46
47 addTokenSubstitute("FB", "<span class=\"bold\">"); // bold begin
48 addTokenSubstitute("Fb", "</span>");
49
50 addTokenSubstitute("FR", "<span class=\"jesuswords\">");
51 addTokenSubstitute("Fr", "</span>");
52
53 addTokenSubstitute("FU", "<u>"); // underline begin
54 addTokenSubstitute("Fu", "</u>");
55
56 addTokenSubstitute("FO", "<span class=\"quotation\">"); // Old Testament quote begin
57 addTokenSubstitute("Fo", "</span>");
58
59
60 addTokenSubstitute("FS", "<span class=\"sup\">"); // Superscript begin// Subscript begin
61 addTokenSubstitute("Fs", "</span>");
62
63 addTokenSubstitute("FV", "<span class=\"sub\">"); // Subscript begin
64 addTokenSubstitute("Fv", "</span>");
65
66 addTokenSubstitute("TT", "<div class=\"booktitle\">");
67 addTokenSubstitute("Tt", "</div>");
68
69 addTokenSubstitute("TS", "<div class=\"sectiontitle\">");
70 addTokenSubstitute("Ts", "</div>");
71
72 //addTokenSubstitute("PP", "<span class=\"poetry\">"); // poetry begin
73 //addTokenSubstitute("Pp", "</span>");
74
75
76 addTokenSubstitute("Fn", "</font>"); // font end
77 addTokenSubstitute("CL", "<br/>"); // new line
78 addTokenSubstitute("CM", "<br/>"); // paragraph <!P> is a non showing comment that can be changed in the front end to <P> if desired
79
80 addTokenSubstitute("CG", "&gt;"); // literal greater-than sign
81 addTokenSubstitute("CT", "&lt;"); // literal less-than sign
82
83 addTokenSubstitute("JR", "<span class=\"right\">"); // right align begin
84 addTokenSubstitute("JC", "<span class=\"center\">"); // center align begin
85 addTokenSubstitute("JL", "</span>"); // align end
86}
87
88/** No descriptions */
89char Filters::GbfToHtml::processText(sword::SWBuf& buf, const sword::SWKey * key, const sword::SWModule * module) {
90 GBFHTML::processText(buf, key, module);
91
92 if (!module->isProcessEntryAttributes()) {
93 return 1; //no processing should be done, may happen in a search
94 }
95
96 if (auto * const m =
97 CSwordBackend::instance().findModuleByName(module->getName()))
98 {
99 // only parse if the module has strongs or lemmas:
100 if (!m->has(CSwordModuleInfo::lemmas)
103 return 1; //WARNING: Return already here
104 }
105
106 //Am Anfang<WH07225> schuf<WH01254><WTH8804> Gott<WH0430> Himmel<WH08064> und<WT> Erde<WH0776>.
107 //A simple word<WT> means: No entry for this word "word"
108
109
110 //split the text into parts which end with the GBF tag marker for strongs/lemmas
111 QStringList list;
112 {
113 auto t = QString::fromUtf8(buf.c_str());
114 {
115 static QRegularExpression const tag(
116 QStringLiteral(R"PCRE(([.,;:]?<W[HGT][^>]*?>\s*)+)PCRE"));
117
118 QRegularExpressionMatch match;
119 auto pos = t.indexOf(tag, 0, &match);
120 if (pos == -1) //no strong or morph code found in this text
121 return 1; //WARNING: Return already here
122 do {
123 auto const partLength = pos + match.capturedLength();
124 list.append(t.left(partLength));
125 t.remove(0, partLength);
126 pos = t.indexOf(tag, 0, &match);
127 } while (pos != -1);
128 }
129
130 //append the trailing text to the list.
131 if (!t.isEmpty())
132 list.append(std::move(t));
133 }
134
135 //list is now a list of words with 1-n Strongs at the end, which belong to this word.
136
137 //now create the necessary HTML in list entries and concat them to the result
138 static QRegularExpression const tag(
139 QStringLiteral(R"PCRE(<W([HGT])([^>]*?)>)PCRE"));
140
141 QString result;
142 for (auto & e : list) { // for each entry to process
143 //qWarning(e.latin1());
144
145 //check if there is a word to which the strongs info belongs to.
146 //If yes, wrap that word with the strongs info
147 //If not, leave out the strongs info, because it can't be tight to a text
148 //Comparing the first char with < is not enough, because the tokenReplace is done already
149 //so there might be html tags already.
150 {
151 static QRegularExpression const re(
152 QStringLiteral(R"PCRE([.,;:])PCRE"));
153 if (e.trimmed().remove(re).left(2) == QStringLiteral("<W")) {
154 result += e;
155 continue;
156 }
157 }
158
159 bool insertedTag = false;
160 bool hasLemmaAttr = false;
161 bool hasMorphAttr = false;
162
163 int tagAttributeStart = -1;
164
165 /* Try to find a strong number marker. Work on all strong/lemma tags in
166 this section, should be between 1-3 loops: */
167 QRegularExpressionMatch match;
168 for (auto pos = e.indexOf(tag, 0, &match);
169 pos != -1;
170 pos = e.indexOf(tag, pos, &match))
171 {
172 auto const isMorph = match.captured(1) == QStringLiteral("T");
173 auto const value =
174 isMorph
175 ? match.captured(2)
176 : match.captured(2).prepend(match.captured(1));
177
178 if (value.isEmpty()) {
179 break;
180 }
181
182 //insert the span
183 if (!insertedTag) { //we have to insert a new tag end and beginning, i.e. our first loop
184 e.replace(pos, match.capturedLength(), QStringLiteral("</span>"));
185 pos += 7;
186
187 //skip blanks, commas, dots and stuff at the beginning, it doesn't belong to the morph code
188 auto rep =
189 QStringLiteral("<span %1=\"%2\">")
190 .arg(isMorph
191 ? QStringLiteral("morph")
192 : QStringLiteral("lemma"),
193 value);
194
195 hasMorphAttr = isMorph;
196 hasLemmaAttr = !isMorph;
197
198 int startPos = 0;
199 QChar c = e[startPos];
200
201 while ((startPos < pos) && (c.isSpace() || c.isPunct())) {
202 ++startPos;
203
204 c = e[startPos];
205 }
206
207 tagAttributeStart = startPos + 6; //to point to the start of the attributes
208 pos += rep.size();
209 e.insert(startPos, std::move(rep));
210 }
211 else { //add the attribute to the existing tag
212 e.remove(pos, match.capturedLength());
213
214 if (tagAttributeStart == -1) {
215 continue; //nothing valid found
216 }
217
218 if ((!isMorph && hasLemmaAttr) || (isMorph && hasMorphAttr)) { //we append another attribute value, e.g. 3000 gets 3000|5000
219 //search the existing attribute start
220 auto const & attrRegExp =
221 [isMorph]{
222 if (isMorph) {
223 static QRegularExpression const re(
224 QStringLiteral("morph=\".+?(?=\")"));
225 return re;
226 } else {
227 static QRegularExpression const re(
228 QStringLiteral("lemma=\".+?(?=\")"));
229 return re;
230 }
231 }();
232 QRegularExpressionMatch match;
233 const int foundPos =
234 e.indexOf(attrRegExp, tagAttributeStart, &match);
235
236 if (foundPos != -1) {
237 e.insert(foundPos + match.capturedLength(),
238 QStringLiteral("|") + value);
239 pos += value.length() + 1;
240
241 hasLemmaAttr = !isMorph;
242 hasMorphAttr = isMorph;
243 }
244 }
245 else { //attribute was not yet inserted
246 hasMorphAttr = isMorph;
247 hasLemmaAttr = !isMorph;
248
249 auto attr = QStringLiteral("%1=\"%2\" ")
250 .arg(isMorph
251 ? QStringLiteral("morph")
252 : QStringLiteral("lemma"),
253 value);
254 pos += attr.size();
255 e.insert(tagAttributeStart, std::move(attr));
256 }
257
258 //tagAttributeStart remains the same
259 }
260
261 insertedTag = true;
262 }
263
264 result += e;
265 }
266
267 if (!list.isEmpty())
268 buf = result.toUtf8().constData();
269
270 return 1;
271}
272
namespace {

/**
 * Converts a single hexadecimal digit to its numeric value.
 *
 * \param hex one of '0'-'9', 'a'-'f' or 'A'-'F'.
 * \returns the value of the digit in the range 0..15.
 * \note Any other character trips BT_ASSERT and then aborts the process.
 */
int hexDigitValue(char const hex) {
    switch (hex) {
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
            return hex - '0';
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
            return hex - 'a' + 10;
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
            return hex - 'A' + 10;
        default:
            BT_ASSERT(false && "Invalid hex code in GBF");
            std::abort(); // also reached when BT_ASSERT expands to nothing
    }
}

/**
 * Decodes the two hexadecimal digits at \p hex into a single character.
 *
 * \param hex pointer to at least two characters, both valid hex digits.
 *            Only hex[0] and hex[1] are read.
 * \returns the character whose code is the decoded value (high digit first).
 */
char hexToChar(char const * const hex) {
    // Evaluate the high digit first so an invalid leading digit is reported
    // before the second one is looked at.
    int const high = hexDigitValue(hex[0]);
    int const low = hexDigitValue(hex[1]);
    return static_cast<char>((high * 16) + low);
}

} // anonymous namespace
294
295bool Filters::GbfToHtml::handleToken(sword::SWBuf &buf, const char *token, sword::BasicFilterUserData *userData) {
296 if (!substituteToken(buf, token)) { // More than a simple replace
297 size_t const tokenLength = std::strlen(token);
298
299 BT_ASSERT(dynamic_cast<UserData *>(userData));
300 UserData * const myUserData = static_cast<UserData *>(userData);
301 // Hack to be able to call stuff like Lang():
302 sword::SWModule const * const myModule =
303 const_cast<sword::SWModule *>(myUserData->module);
304
305 /* We use several append calls because appendFormatted slows down
306 filtering, which should be fast. */
307
308 if (!std::strncmp(token, "WG", 2u)
309 || !std::strncmp(token, "WH", 2u)
310 || !std::strncmp(token, "WT", 2u))
311 {
312 buf.append('<').append(token).append('>');
313 } else if (!std::strncmp(token, "RB", 2u)) {
314 myUserData->hasFootnotePreTag = true;
315 buf.append("<span class=\"footnotepre\">");
316 } else if (!std::strncmp(token, "RF", 2u)) {
317 if (myUserData->hasFootnotePreTag) {
318 // qWarning("inserted footnotepre end");
319 buf.append("</span>");
320 myUserData->hasFootnotePreTag = false;
321 }
322
323 buf.append(" <span class=\"footnote\" note=\"")
324 .append(myModule->getName())
325 .append('/')
326 .append(myUserData->key->getShortText())
327 .append('/')
328 .append(QString::number(myUserData->swordFootnote).toUtf8().constData())
329 .append("\">*</span> ");
330 myUserData->swordFootnote++;
331 userData->suspendTextPassThru = true;
332 } else if (!std::strncmp(token, "Rf", 2u)) { // End of footnote
333 userData->suspendTextPassThru = false;
334 } else if (!std::strncmp(token, "FN", 2u)) {
335 // The end </font> tag is inserted in addTokenSubsitute
336 buf.append("<font face=\"");
337 for (size_t i = 2u; i < tokenLength; i++)
338 if (token[i] != '\"')
339 buf.append(token[i]);
340 buf.append("\">");
341 } else if (!std::strncmp(token, "CA", 2u)) { // ASCII value <CA##> in hex
342 BT_ASSERT(tokenLength == 4u);
343 buf.append(static_cast<char>(hexToChar(token + 2u)));
344 } else {
345 return GBFHTML::handleToken(buf, token, userData);
346 }
347 }
348
349 return true;
350}
#define BT_ASSERT(...)
Definition btassert.h:17
static CSwordBackend & instance() noexcept
static FilterOption const strongNumbers
static FilterOption const morphTags
static FilterOption const lemmas
short unsigned int swordFootnote
Definition gbftohtml.h:55
bool handleToken(sword::SWBuf &buf, const char *token, sword::BasicFilterUserData *userData) override
char processText(sword::SWBuf &buf, const sword::SWKey *key, const sword::SWModule *module=nullptr) override
Definition gbftohtml.cpp:89
int hexDigitValue(char const hex)
char hexToChar(char const *const hex)