BibleTime
cswordmodulesearch.cpp
Go to the documentation of this file.
1 /*********
2 *
3 * In the name of the Father, and of the Son, and of the Holy Spirit.
4 *
5 * This file is part of BibleTime's source code, https://bibletime.info/
6 *
7 * Copyright 1999-2021 by the BibleTime developers.
8 * The BibleTime source code is licensed under the GNU General Public License
9 * version 2.0.
10 *
11 **********/
12 
13 #include "cswordmodulesearch.h"
14 
15 #include <algorithm>
16 #include <QChar>
17 #include <QDataStream>
18 #include <QRegularExpression>
19 #include <QRegularExpressionMatch>
20 #include <QStringList>
21 #include <QtCore>
22 #include "../util/btassert.h"
23 #include "config/btconfig.h"
25 #include "managers/cswordbackend.h"
26 
27 // Sword includes:
28 #include <listkey.h>
29 
30 
31 namespace CSwordModuleSearch {
32 
33 Results search(QString const & searchText,
34  BtConstModuleList const & modules,
35  sword::ListKey scope)
36 {
37  BT_ASSERT(std::all_of(modules.begin(),
38  modules.end(),
39  [](auto const * const m) { return m->hasIndex(); }));
40 
41  /// \todo What is the purpose of the following statement?
42  CSwordBackend::instance().setFilterOptions(btConfig().getFilterOptions());
43 
44  // Search module-by-module:
45  Results r;
46  r.reserve(modules.size());
47  for (auto const * const m : modules)
48  r.emplace_back(
49  ModuleSearchResult{m, m->searchIndexed(searchText, scope)});
50  return r;
51 }
52 
53 namespace {
54 
55 /** This function does a terrible job of trying to parse a CLucene query string
56  into a list of words (potentially containing the * or ? glob characters) for
57  word highlighting purposes. */
58 QStringList queryParser(QString const & queryString) {
59  QStringList tokenList;
60  {
61  QString token;
62  bool tokenHasLetterOrNumber = false;
63  bool tokenHasStar = false;
64  auto const pushToken =
65  [&] {
66  if (tokenHasLetterOrNumber
67  // Ignore empty tokens and those equivalent to glob (*):
68  || (!tokenHasStar && !token.isEmpty()))
69  tokenList.append(token);
70  token.clear();
71  tokenHasLetterOrNumber = false;
72  tokenHasStar = false;
73  };
74  for (int cnt = 0; cnt < queryString.size(); cnt++) {
75  auto const c = queryString[cnt];
76  if (c.isLetterOrNumber()) {
77  token.append(c);
78  tokenHasLetterOrNumber = true;
79  } else if (c == '*') {
80  token.append(c);
81  tokenHasStar = true;
82  } else if (c == '?') {
83  token.append(c);
84  } else if (c == '!' || c == '-' || c == '+') {
85  pushToken();
86  tokenList.append(c);
87  } else if ((c == '|' || c == '&')
88  && cnt + 1 < queryString.size()
89  && queryString[cnt + 1] == c)
90  {
91  pushToken();
92  tokenList.append(QString(2, c));
93  ++cnt;
94  } else { // Spaces and other unrecognized stuff act as separators:
95  pushToken();
96  }
97  }
98  pushToken();
99  }
100 
101  for (auto it = tokenList.begin(); it != tokenList.end(); ++it) {
102  int pos;
103  //-----------------------------------------------------------
104  // remove all the NOT(!) tokens - these do not need to be
105  // highlighted in the highlighter
106  //-----------------------------------------------------------
107  if (((*it) == '!')
108  || ((*it) == QStringLiteral("NOT"))
109  || ((*it) == '-'))
110  {
111  it = tokenList.erase(it);
112  if (it == tokenList.end())
113  break;
114  it = tokenList.erase(it);
115  if (it == tokenList.end())
116  break;
117  --it;
118  }
119  //-----------------------------------------------------------
120  // remove all the operator tokens - these do not need to be
121  // highlighted in the highlighter
122  //-----------------------------------------------------------
123  else if (((*it) == QStringLiteral("||"))
124  || ((*it) == QStringLiteral("OR"))
125  || ((*it) == '+')
126  || ((*it) == QStringLiteral("AND"))
127  || ((*it) == QStringLiteral("&&")))
128  {
129  it = tokenList.erase(it);
130  if (it == tokenList.end())
131  break;
132  --it;
133  }
134  // if the token contains a ^ then trim the remainder of the
135  // token from the ^
136  //What??? error: invalid conversion from 'const void*' to 'int'
137  // and how come "contains" returns bool but is used as int?
138  //else if ( (pos = (*it).contains("^")) >= 0 ) {
139  else if ( (pos = (*it).indexOf('^') ) >= 0 ) {
140  (*it) = (*it).left(pos - 1);
141  }
142  // if the token contains a ~ then trim the remainder of the
143  // token from the ~
144  else if ( (pos = (*it).indexOf('~') ) >= 0 ) {
145  (*it) = (*it).left(pos - 2) + '*';
146  }
147  }
148  return(tokenList);
149 }
150 
151 static auto const spaceRegexpString(QStringLiteral(R"PCRE(\s+)PCRE"));
152 static QRegularExpression const spaceRegexp(spaceRegexpString);
153 
154 } // anonymous namespace
155 
156 QString highlightSearchedText(QString const & content,
157  QString const & searchedText,
158  bool plainSearchedText)
159 {
160  static Qt::CaseSensitivity const cs = Qt::CaseInsensitive;
161 
162  static auto const skipIndexToTagEnd =
163  [](auto const & str, auto i) {
164  static QRegularExpression const re(
165  QStringLiteral(R"PCRE(["'>])PCRE"));
166  for (;;) {
167  i = str.indexOf(re, i);
168  if (i < 0)
169  return i;
170 
171  auto const match = str.at(i);
172  if (match == QLatin1Char('>'))
173  return i + 1;
174 
175  // Skip to end of quoted attribute value:
176  i = str.indexOf(match, ++i);
177  if (i < 0)
178  return i;
179  ++i;
180  }
181  };
182 
183  auto const bodyIndex =
184  [&content]{
185  static QRegularExpression const tagRe(
186  QStringLiteral(R"PCRE(<body(>|\\s))PCRE"));
187  auto const i = content.indexOf(tagRe);
188  return (i < 0) ? 0 : skipIndexToTagEnd(content, i + 5);
189  }();
190  if (bodyIndex < 0)
191  return content;
192 
193  auto ret = content.mid(bodyIndex);
194 
195  if (!plainSearchedText) {
196  // find the strongs search lemma and highlight it
197  for (auto const & newSearchText
198  : searchedText.split(spaceRegexp, Qt::SkipEmptyParts))
199  {
200  // strong search text index for finding "strong:"
201  int sstIndex = newSearchText.indexOf(QStringLiteral("strong:"));
202  if (sstIndex == -1)
203  continue;
204 
205  // Get the strongs number from the search text.
206  // First, find the first space after "strong:"
207  sstIndex = sstIndex + 7;
208 
209  // set the start index to the start of <body>
210  int strongIndex = 0;
211 
212  // get the strongs number -> the text following "strong:" to the end
213  // of the string. find all the "lemma=" inside the the content
214  while ((strongIndex =
215  ret.indexOf(QStringLiteral("lemma="), strongIndex, cs))
216  != -1)
217  {
218  // get the strongs number after the lemma and compare it with
219  // the strongs number we are looking for
220  int const idx1 = ret.indexOf('"', strongIndex) + 1;
221  int const idx2 = ret.indexOf('"', idx1 + 1);
222 
223  // this is interesting because we could have a strongs number
224  // like G3218|G300. To handle this we will use some extra cpu
225  // cycles and do a partial match against the lemmaText
226  if (ret.mid(idx1, idx2 - idx1)
227  .contains(newSearchText.mid(sstIndex, -1)))
228  {
229  static auto const rep3 =
230  QStringLiteral(R"HTML(class="highlightwords" )HTML");
231  // strongs number is found now we need to highlight it
232  // I believe the easiest way is to insert rep3 just before
233  // "lemma="
234  ret = ret.insert(strongIndex, rep3); /// \bug ?
235  strongIndex += rep3.length();
236  }
237  strongIndex += 6; // 6 is the length of "lemma="
238  }
239  }
240  //---------------------------------------------------------------------
241  // now that the strong: stuff is out of the way continue with
242  // other search options
243  //---------------------------------------------------------------------
244  }
245 
246  QRegularExpression highlightRegex; // Construct highLightRegex:
247  if (plainSearchedText) {
248  auto words = searchedText.split(spaceRegexp, Qt::SkipEmptyParts);
249  for (auto & word : words)
250  word = QRegularExpression::escape(word);
251  highlightRegex = QRegularExpression(words.join(spaceRegexpString));
252  } else {
253  QString wordsRegexString;
254  for (auto const & word : queryParser(searchedText)) {
255  QString wordRegexString;
256  auto const wordSize = word.size();
257  wordRegexString.reserve(wordSize + 3);
258 
259  static QRegularExpression const wildCardRegex(
260  QStringLiteral(R"PCRE([*?])PCRE"));
261  auto fragmentEnd = word.indexOf(wildCardRegex);
262  decltype(fragmentEnd) fragmentStart = 0;
263  while (fragmentEnd >= 0) {
264  if (auto const fragmentSize = fragmentEnd - fragmentStart)
265  wordRegexString.append(
266  QRegularExpression::escape(
267  word.mid(fragmentStart, fragmentSize)));
268  wordRegexString.append(word.at(fragmentEnd) == QLatin1Char('*')
269  ? QStringLiteral(R"PCRE(\S*?)PCRE")
270  : QStringLiteral(R"PCRE(\S)PCRE"));
272  fragmentEnd = word.indexOf(wildCardRegex, fragmentStart);
273  }
274  wordRegexString.append(
275  QRegularExpression::escape(word.mid(fragmentStart)));
276 
277  if (!wordsRegexString.isEmpty())
278  wordsRegexString.append(QLatin1Char('|'));
279  wordsRegexString.append(wordRegexString);
280  }
282  QRegularExpression(
283  QStringLiteral(R"PCRE(\b(%1)\b)PCRE").arg(wordsRegexString),
284  QRegularExpression::CaseInsensitiveOption);
285  }
286 
287  QStringList r(content.left(bodyIndex));
288 
289  // Iterate over HTML text fragments:
290  auto fragmentStart = 0;
291  auto fragmentEnd = ret.indexOf(QLatin1Char('<'), fragmentStart);
292  decltype(ret.size()) fragmentSize =
293  (fragmentEnd < 0 ? ret.size() : fragmentEnd) - fragmentStart;
294  for (QRegularExpressionMatch match;;) {
295  if (fragmentSize > 0) {
296  #if (QT_VERSION < QT_VERSION_CHECK(6, 0, 0))
297  auto const fragment = ret.mid(fragmentStart, fragmentSize);
298  #else
299  QStringView const fragment(ret.constData() + fragmentStart,
300  fragmentSize);
301  #endif
302  decltype(fragmentStart) searchStart = 0;
303  for (;;) {
304  auto i = fragment.indexOf(highlightRegex, searchStart, &match);
305  if (i < 0) {
306  #if (QT_VERSION < QT_VERSION_CHECK(6, 0, 0))
307  r << fragment.mid(searchStart);
308  #else
309  r << fragment.mid(searchStart).toString();
310  #endif
311  break;
312  }
313 
314  if (auto const noMatchSize = i - searchStart) {
315  #if (QT_VERSION < QT_VERSION_CHECK(6, 0, 0))
316  r << fragment.mid(searchStart, noMatchSize);
317  #else
318  r << fragment.mid(searchStart, noMatchSize).toString();
319  #endif
320  }
321  r << QStringLiteral(R"HTML(<span class="highlightwords">)HTML")
322  << match.captured()
323  << QStringLiteral(R"HTML(</span>)HTML");
324  searchStart = i + match.capturedLength();
325  }
326  }
327 
328  if (fragmentEnd < 0)
329  break;
330  fragmentStart = skipIndexToTagEnd(ret, fragmentEnd + 1);
331  r << ret.mid(fragmentEnd, fragmentStart - fragmentEnd);
332  fragmentEnd = ret.indexOf(QLatin1Char('<'), fragmentStart);
333  fragmentSize =
334  (fragmentEnd < -1) ? ret.size() : (fragmentEnd - fragmentStart);
335  }
336 
337  return r.join(QString());
338 }
339 
340 QString prepareSearchText(QString const & orig, SearchType const searchType) {
341  if (searchType == FullType)
342  return orig;
343  auto words = orig.split(spaceRegexp, Qt::SkipEmptyParts);
344  static QRegularExpression const escapeRe(
345  QLatin1String(R"PCRE(([\\+\-\!\‍(\)\:\^\]\[\"\{\}\~\*\?\|\&]))PCRE"));
346  for (auto & word : words)
347  word = QStringLiteral(R"("%1")").arg(word.replace(escapeRe, "\\\\1"));
348  return words.join(searchType == OrType
349  ? QStringLiteral(" OR ")
350  : QStringLiteral(" AND "));
351 }
352 
353 } // namespace CSwordModuleSearch
354 
355 QDataStream &operator<<(QDataStream &out, const CSwordModuleSearch::SearchType &searchType) {
356  out << static_cast<qint8>(searchType);
357  return out;
358 }
359 
360 QDataStream &operator>>(QDataStream &in, CSwordModuleSearch::SearchType &searchType) {
361  qint8 i;
362  in >> i;
363  searchType = static_cast<CSwordModuleSearch::SearchType>(i);
364  return in;
365 }
#define BT_ASSERT(...)
Definition: btassert.h:17
BtConfig & btConfig()
This is a shortchand for BtConfig::getInstance().
Definition: btconfig.h:305
QList< CSwordModuleInfo const * > BtConstModuleList
Definition: btmodulelist.h:21
static CSwordBackend & instance() noexcept
Definition: cswordbackend.h:98
void setFilterOptions(const FilterOptions &options)
auto fragmentEnd
QRegularExpression highlightRegex
QDataStream & operator<<(QDataStream &out, const CSwordModuleSearch::SearchType &searchType)
for(auto const &word :queryParser(searchedText))
decltype(ret.size()) fragmentSize
auto fragmentStart
QDataStream & operator>>(QDataStream &in, CSwordModuleSearch::SearchType &searchType)
QStringList r(content.left(bodyIndex))
static QRegularExpression const spaceRegexp(spaceRegexpString)
static auto const spaceRegexpString(QStringLiteral(R"PCRE(\s+)PCRE"))
QString prepareSearchText(QString const &orig, SearchType const searchType)
QString highlightSearchedText(QString const &content, QString const &searchedText, bool plainSearchedText)
Results search(QString const &searchText, BtConstModuleList const &modules, sword::ListKey scope)
std::vector< ModuleSearchResult > Results