BibleTime
cswordmodulesearch.cpp
Go to the documentation of this file.
1/*********
2*
3* In the name of the Father, and of the Son, and of the Holy Spirit.
4*
5* This file is part of BibleTime's source code, https://bibletime.info/
6*
7* Copyright 1999-2025 by the BibleTime developers.
8* The BibleTime source code is licensed under the GNU General Public License
9* version 2.0.
10*
11**********/
12
13#include "cswordmodulesearch.h"
14
15#include <algorithm>
16#include <QChar>
17#include <QDataStream>
18#include <QRegularExpression>
19#include <QRegularExpressionMatch>
20#include <QStringList>
21#include <QtCore>
22#include "../util/btassert.h"
23#include "config/btconfig.h"
26
27// Sword includes:
28#include <listkey.h>
29
30
32
/**
  \brief Runs an indexed search for \a searchText in each of \a modules.

  Every module is asserted (BT_ASSERT) to report hasIndex(). The modules are
  then searched one after another via searchIndexed(searchText, scope).

  \param[in] searchText the search text handed to each module's
                        searchIndexed().
  \param[in] modules the modules to search, in order.
  \param[in] scope the key list handed to each module's searchIndexed().
  \returns one ModuleSearchResult per module, in the order of \a modules.
*/
33Results search(QString const & searchText,
34 BtConstModuleList const & modules,
35 sword::ListKey scope)
36{
37 BT_ASSERT(std::all_of(modules.begin(),
38 modules.end(),
39 [](auto const * const m) { return m->hasIndex(); }));
40
41 /// \todo What is the purpose of the following statement?
// NOTE(review): the statement the \todo above refers to is not present in
// this listing (the embedded numbering jumps from 41 to 43) - confirm
// against the repository before touching this function.
43
44 // Search module-by-module:
45 Results r;
// One result is appended per module, so the final size is known up front:
46 r.reserve(modules.size());
47 for (auto const * const m : modules)
48 r.emplace_back(
49 ModuleSearchResult{m, m->searchIndexed(searchText, scope)});
50 return r;
51}
52
53namespace {
54
55/** This function does a terrible job of trying to parse a CLucene query string
56 into a list of words (potentially containing the * or ? glob characters) for
57 word highlighting purposes. */
58QStringList queryParser(QString const & queryString) {
59 QStringList tokenList;
60 {
61 QString token;
62 bool tokenHasLetterOrNumber = false;
63 bool tokenHasStar = false;
64 auto const pushToken =
65 [&] {
66 if (tokenHasLetterOrNumber
67 // Ignore empty tokens and those equivalent to glob (*):
68 || (!tokenHasStar && !token.isEmpty()))
69 tokenList.append(token);
70 token.clear();
71 tokenHasLetterOrNumber = false;
72 tokenHasStar = false;
73 };
74 for (int cnt = 0; cnt < queryString.size(); cnt++) {
75 auto const c = queryString[cnt];
76 if (c.isLetterOrNumber()) {
77 token.append(c);
78 tokenHasLetterOrNumber = true;
79 } else if (c == '*') {
80 token.append(c);
81 tokenHasStar = true;
82 } else if (c == '?') {
83 token.append(c);
84 } else if (c == '!' || c == '-' || c == '+') {
85 pushToken();
86 tokenList.append(c);
87 } else if ((c == '|' || c == '&')
88 && cnt + 1 < queryString.size()
89 && queryString[cnt + 1] == c)
90 {
91 pushToken();
92 tokenList.append(QString(2, c));
93 ++cnt;
94 } else { // Spaces and other unrecognized stuff act as separators:
95 pushToken();
96 }
97 }
98 pushToken();
99 }
100
101 for (auto it = tokenList.begin(); it != tokenList.end(); ++it) {
102 int pos;
103 //-----------------------------------------------------------
104 // remove all the NOT(!) tokens - these do not need to be
105 // highlighted in the highlighter
106 //-----------------------------------------------------------
107 if (((*it) == '!')
108 || ((*it) == QStringLiteral("NOT"))
109 || ((*it) == '-'))
110 {
111 it = tokenList.erase(it);
112 if (it == tokenList.end())
113 break;
114 it = tokenList.erase(it);
115 if (it == tokenList.end())
116 break;
117 --it;
118 }
119 //-----------------------------------------------------------
120 // remove all the operator tokens - these do not need to be
121 // highlighted in the highlighter
122 //-----------------------------------------------------------
123 else if (((*it) == QStringLiteral("||"))
124 || ((*it) == QStringLiteral("OR"))
125 || ((*it) == '+')
126 || ((*it) == QStringLiteral("AND"))
127 || ((*it) == QStringLiteral("&&")))
128 {
129 it = tokenList.erase(it);
130 if (it == tokenList.end())
131 break;
132 --it;
133 }
134 // if the token contains a ^ then trim the remainder of the
135 // token from the ^
136 //What??? error: invalid conversion from 'const void*' to 'int'
137 // and how come "contains" returns bool but is used as int?
138 //else if ( (pos = (*it).contains("^")) >= 0 ) {
139 else if ( (pos = (*it).indexOf('^') ) >= 0 ) {
140 (*it) = (*it).left(pos - 1);
141 }
142 // if the token contains a ~ then trim the remainder of the
143 // token from the ~
144 else if ( (pos = (*it).indexOf('~') ) >= 0 ) {
145 (*it) = (*it).left(pos - 2) + '*';
146 }
147 }
148 return(tokenList);
149}
150
151static auto const spaceRegexpString(QStringLiteral(R"PCRE(\s+)PCRE"));
152static QRegularExpression const spaceRegexp(spaceRegexpString);
153
154} // anonymous namespace
155
156QString highlightSearchedText(QString const & content,
157 QString const & searchedText,
158 bool plainSearchedText)
159{
160 static Qt::CaseSensitivity const cs = Qt::CaseInsensitive;
161
162 static auto const skipIndexToTagEnd =
163 [](auto const & str, auto i) {
164 static QRegularExpression const re(
165 QStringLiteral(R"PCRE(["'>])PCRE"));
166 for (;;) {
167 i = str.indexOf(re, i);
168 if (i < 0)
169 return i;
170
171 auto const match = str.at(i);
172 if (match == QLatin1Char('>'))
173 return i + 1;
174
175 // Skip to end of quoted attribute value:
176 i = str.indexOf(match, ++i);
177 if (i < 0)
178 return i;
179 ++i;
180 }
181 };
182
183 auto const bodyIndex =
184 [&content]{
185 static QRegularExpression const tagRe(
186 QStringLiteral(R"PCRE(<body(>|\\s))PCRE"));
187 auto const i = content.indexOf(tagRe);
188 return (i < 0) ? 0 : skipIndexToTagEnd(content, i + 5);
189 }();
190 if (bodyIndex < 0)
191 return content;
192
193 auto ret = content.mid(bodyIndex);
194
195 if (!plainSearchedText) {
196 // find the strongs search lemma and highlight it
197 for (auto const & newSearchText
198 : searchedText.split(spaceRegexp, Qt::SkipEmptyParts))
199 {
200 // strong search text index for finding "strong:"
201 int sstIndex = newSearchText.indexOf(QStringLiteral("strong:"));
202 if (sstIndex == -1)
203 continue;
204
205 // Get the strongs number from the search text.
206 // First, find the first space after "strong:"
207 sstIndex = sstIndex + 7;
208
209 // set the start index to the start of <body>
210 int strongIndex = 0;
211
212 // get the strongs number -> the text following "strong:" to the end
213 // of the string. find all the "lemma=" inside the the content
214 while ((strongIndex =
215 ret.indexOf(QStringLiteral("lemma="), strongIndex, cs))
216 != -1)
217 {
218 // get the strongs number after the lemma and compare it with
219 // the strongs number we are looking for
220 int const idx1 = ret.indexOf('"', strongIndex) + 1;
221 int const idx2 = ret.indexOf('"', idx1 + 1);
222
223 // this is interesting because we could have a strongs number
224 // like G3218|G300. To handle this we will use some extra cpu
225 // cycles and do a partial match against the lemmaText
226 if (ret.mid(idx1, idx2 - idx1)
227 .contains(newSearchText.mid(sstIndex, -1)))
228 {
229 static auto const rep3 =
230 QStringLiteral(R"HTML(class="highlightwords" )HTML");
231 // strongs number is found now we need to highlight it
232 // I believe the easiest way is to insert rep3 just before
233 // "lemma="
234 ret = ret.insert(strongIndex, rep3); /// \bug ?
235 strongIndex += rep3.length();
236 }
237 strongIndex += 6; // 6 is the length of "lemma="
238 }
239 }
240 //---------------------------------------------------------------------
241 // now that the strong: stuff is out of the way continue with
242 // other search options
243 //---------------------------------------------------------------------
244 }
245
246 QRegularExpression highlightRegex; // Construct highLightRegex:
247 if (plainSearchedText) {
248 auto words = searchedText.split(spaceRegexp, Qt::SkipEmptyParts);
249 if (words.isEmpty())
250 return content.left(bodyIndex) + ret;
251 for (auto & word : words)
252 word = QRegularExpression::escape(word);
253 highlightRegex = QRegularExpression(words.join(spaceRegexpString));
254 } else {
255 auto const query = queryParser(searchedText);
256 if (query.isEmpty())
257 return content.left(bodyIndex) + ret;
259 for (auto const & word : query) {
260 QString wordRegexString;
261 auto const wordSize = word.size();
262 wordRegexString.reserve(wordSize + 3);
263
264 static QRegularExpression const wildCardRegex(
265 QStringLiteral(R"PCRE([*?])PCRE"));
266 auto fragmentEnd = word.indexOf(wildCardRegex);
267 decltype(fragmentEnd) fragmentStart = 0;
268 while (fragmentEnd >= 0) {
269 if (auto const fragmentSize = fragmentEnd - fragmentStart)
270 wordRegexString.append(
271 QRegularExpression::escape(
272 word.mid(fragmentStart, fragmentSize)));
273 wordRegexString.append(word.at(fragmentEnd) == QLatin1Char('*')
274 ? QStringLiteral(R"PCRE(\S*?)PCRE")
275 : QStringLiteral(R"PCRE(\S)PCRE"));
277 fragmentEnd = word.indexOf(wildCardRegex, fragmentStart);
278 }
279 wordRegexString.append(
280 QRegularExpression::escape(word.mid(fragmentStart)));
281
282 if (!wordsRegexString.isEmpty())
283 wordsRegexString.append(QLatin1Char('|'));
284 wordsRegexString.append(wordRegexString);
285 }
287 QRegularExpression(
288 QStringLiteral(R"PCRE(\b(%1)\b)PCRE").arg(wordsRegexString),
289 QRegularExpression::CaseInsensitiveOption);
290 }
291
292 QStringList r(content.left(bodyIndex));
293
294 // Iterate over HTML text fragments:
296 auto fragmentEnd = ret.indexOf(QLatin1Char('<'), fragmentStart);
297 decltype(ret.size()) fragmentSize =
298 (fragmentEnd < 0 ? ret.size() : fragmentEnd) - fragmentStart;
299 for (QRegularExpressionMatch match;;) {
300 if (fragmentSize > 0) {
301 QStringView const fragment(ret.constData() + fragmentStart,
303 decltype(fragmentStart) searchStart = 0;
304 for (;;) {
305 auto i = fragment.indexOf(highlightRegex, searchStart, &match);
306 if (i < 0) {
307 r << fragment.mid(searchStart).toString();
308 break;
309 }
310
311 if (auto const noMatchSize = i - searchStart) {
312 r << fragment.mid(searchStart, noMatchSize).toString();
313 }
314 r << QStringLiteral(R"HTML(<span class="highlightwords">)HTML")
315 << match.captured()
316 << QStringLiteral(R"HTML(</span>)HTML");
317 searchStart = i + match.capturedLength();
318 }
319 }
320
321 if (fragmentEnd < 0)
322 break;
323 fragmentStart = skipIndexToTagEnd(ret, fragmentEnd + 1);
324 r << ret.mid(fragmentEnd, fragmentStart - fragmentEnd);
325 fragmentEnd = ret.indexOf(QLatin1Char('<'), fragmentStart);
327 (fragmentEnd < -1) ? ret.size() : (fragmentEnd - fragmentStart);
328 }
329
330 return r.join(QString());
331}
332
333QString prepareSearchText(QString const & orig, SearchType const searchType) {
334 if (searchType == FullType)
335 return orig;
336 auto words = orig.split(spaceRegexp, Qt::SkipEmptyParts);
337 static QRegularExpression const escapeRe(
338 QLatin1String(R"PCRE(([\\+\-\!\‍(\)\:\^\]\[\"\{\}\~\*\?\|\&]))PCRE"));
339 for (auto & word : words)
340 word = QStringLiteral(R"("%1")").arg(word.replace(escapeRe, "\\\\1"));
341 return words.join(searchType == OrType
342 ? QStringLiteral(" OR ")
343 : QStringLiteral(" AND "));
344}
345
346} // namespace CSwordModuleSearch
347
348QDataStream &operator<<(QDataStream &out, const CSwordModuleSearch::SearchType &searchType) {
349 out << static_cast<qint8>(searchType);
350 return out;
351}
352
353QDataStream &operator>>(QDataStream &in, CSwordModuleSearch::SearchType &searchType) {
354 qint8 i;
355 in >> i;
356 searchType = static_cast<CSwordModuleSearch::SearchType>(i);
357 return in;
358}
#define BT_ASSERT(...)
Definition btassert.h:17
BtConfig & btConfig()
This is a shortchand for BtConfig::getInstance().
Definition btconfig.h:305
QList< CSwordModuleInfo const * > BtConstModuleList
static CSwordBackend & instance() noexcept
void setFilterOptions(const FilterOptions &options)
auto fragmentEnd
QDataStream & operator>>(QDataStream &in, CSwordModuleSearch::SearchType &searchType)
QString wordsRegexString
QRegularExpression highlightRegex
QDataStream & operator<<(QDataStream &out, const CSwordModuleSearch::SearchType &searchType)
QString prepareSearchText(QString const &orig, SearchType const searchType)
decltype(ret.size()) fragmentSize
auto fragmentStart
QStringList r(content.left(bodyIndex))
static QRegularExpression const spaceRegexp(spaceRegexpString)
static auto const spaceRegexpString(QStringLiteral(R"PCRE(\s+)PCRE"))
QString highlightSearchedText(QString const &content, QString const &searchedText, bool plainSearchedText)
Results search(QString const &searchText, BtConstModuleList const &modules, sword::ListKey scope)
std::vector< ModuleSearchResult > Results