OpenTREP Logo  0.6.0
C++ Open Travel Request Parsing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Filter.cpp
Go to the documentation of this file.
1 // //////////////////////////////////////////////////////////////////////
2 // Import section
3 // //////////////////////////////////////////////////////////////////////
4 // STL
5 #include <cassert>
6 #include <sstream>
7 // OpenTrep
10 #include <opentrep/bom/Filter.hpp>
12 
13 namespace OPENTREP {
14 
15  // //////////////////////////////////////////////////////////////////////
16  Filter::Filter() {
17  assert (false);
18  }
19 
20  // //////////////////////////////////////////////////////////////////////
21  Filter::Filter (const Filter& iFilter) {
22  assert (false);
23  }
24 
25  // //////////////////////////////////////////////////////////////////////
26  Filter::~Filter() {
27  }
28 
29 
38  // //////////////////////////////////////////////////////////////////////
39  bool hasGoodSize (const std::string& iWord, const NbOfLetters_T& iMinWordLength) {
40  bool hasGoodSizeFlag = true;
41  //
42  const size_t lWordLength = iWord.size();
43  if (lWordLength < iMinWordLength) {
44  hasGoodSizeFlag = false;
45  }
46  return hasGoodSizeFlag;
47  }
48 
52  // //////////////////////////////////////////////////////////////////////
53  bool isBlackListed (const std::string& iWord) {
54  // When the word is part of the "black list", it should obviously be
55  // filtered out.
56  BlackList_T::const_iterator itWord = K_BLACK_LIST.find (iWord);
57  const bool isBlackListedFlag = (itWord != K_BLACK_LIST.end());
58 
59  // DEBUG
60  // const std::string areEqualStr = (isBlackListedFlag)?"Yes":"No";
61  // const std::string& lWord = *itWord;
62  // OPENTREP_LOG_DEBUG ("Word: '" << iWord << "', black-list word: '"
63  // << lWord << "', Equals: " << areEqualStr);
64 
65  return isBlackListedFlag;
66  }
67 
71  // //////////////////////////////////////////////////////////////////////
72  void rtrim (WordList_T& ioWordList, const NbOfLetters_T& iMinWordLength) {
73  // If the list is empty, obviously nothing can be done at that stage.
74  if (ioWordList.empty() == true) {
75  return;
76  }
77 
78  // Take the first right outer word
79  WordList_T::reverse_iterator itWord = ioWordList.rbegin();
80  assert (itWord != ioWordList.rend());
81  const std::string& lWord = *itWord;
82 
83  // Check whether that word has the good size (>= iMinWordLength) and whether it is
84  // black-listed.
85  const bool hasGoodSizeFlag = hasGoodSize (lWord, iMinWordLength);
86  const bool isBlackListedFlag = isBlackListed (lWord);
87  if (hasGoodSizeFlag == false || isBlackListedFlag == true) {
88  ioWordList.erase (--itWord.base());
89  rtrim (ioWordList, iMinWordLength);
90  }
91  }
92 
96  // //////////////////////////////////////////////////////////////////////
97  void ltrim (WordList_T& ioWordList, const NbOfLetters_T& iMinWordLength) {
98  // If the list is empty, obviously nothing can be done at that stage.
99  if (ioWordList.empty() == true) {
100  return;
101  }
102 
103  // Take the first left outer word
104  WordList_T::iterator itWord = ioWordList.begin();
105  assert (itWord != ioWordList.end());
106  const std::string& lWord = *itWord;
107 
108  // Check whether that word has the good size (>= iMinWordLength) and whether it is
109  // black-listed.
110  const bool hasGoodSizeFlag = hasGoodSize (lWord, iMinWordLength);
111  const bool isBlackListedFlag = isBlackListed (lWord);
112  if (hasGoodSizeFlag == false || isBlackListedFlag == true) {
113  ioWordList.erase (itWord);
114  ltrim (ioWordList, iMinWordLength);
115  }
116  }
117 
121  // //////////////////////////////////////////////////////////////////////
122  void trim (WordList_T& ioWordList, const NbOfLetters_T& iMinWordLength) {
123  // Trim the non-relevant left outer words
124  ltrim (ioWordList, iMinWordLength);
125 
126  // Trim the non-relevant right outer words
127  rtrim (ioWordList, iMinWordLength);
128  }
129 
130  // //////////////////////////////////////////////////////////////////////
131  void Filter::trim (std::string& ioPhrase, const NbOfLetters_T& iMinWordLength) {
132  // Create a list of words from the given phrase
133  WordList_T lWordList;
134  tokeniseStringIntoWordList (ioPhrase, lWordList);
135 
136  // Trim the non-relevant left and right outer words
137  OPENTREP::trim (lWordList, iMinWordLength);
138 
139  // Re-create the phrase from the (potentially altered) list of words
140  ioPhrase = createStringFromWordList (lWordList);
141  }
142 
143  // //////////////////////////////////////////////////////////////////////
144  bool Filter::shouldKeep (const std::string& iPhrase,
145  const std::string& iWord) {
146  bool isToBeKept = true;
147 
148  // If both the phrase and the word are empty, the word should obviously
149  // be filtered out.
150  if (iPhrase.empty() == true && iWord.empty() == true) {
151  isToBeKept = false;
152  return isToBeKept;
153  }
154 
155  // If the term to be added is equal to the whole phrase (e.g., 'san'),
156  // it should be kept (not filtered out). Indeed, three-letter words
157  // often correspond to IATA codes, and should obviously be kept for
158  // indexation/searching.
159  if (iPhrase == iWord) {
160  return isToBeKept;
161  }
162 
163  // Now, the word is part of the phrase, and not equal to it (and not empty).
164 
165  // If the word has no more than two letters (e.g., 'de'), it should be
166  // filtered out. Indeed, when 'de' is part of 'charles de gaulle',
167  // for instance, it should not be indexed/searched alone (in a search,
168  // the resulting match score will be zero).
169  isToBeKept = hasGoodSize (iWord, 3);
170  if (isToBeKept == false) {
171  return isToBeKept;
172  }
173 
174  // Check whether the word is black-listed
175  isToBeKept = !isBlackListed (iWord);
176 
177  //
178  return isToBeKept;
179  }
180 
181 }
void ltrim(WordList_T &ioWordList, const NbOfLetters_T &iMinWordLength)
Definition: Filter.cpp:97
bool isBlackListed(const std::string &iWord)
Definition: Filter.cpp:53
void trim(WordList_T &ioWordList, const NbOfLetters_T &iMinWordLength)
Definition: Filter.cpp:122
std::string createStringFromWordList(const WordList_T &iWordList, const unsigned short iSplitIdx, const bool iFromBeginningFlag)
Definition: Utilities.cpp:38
unsigned int NbOfLetters_T
void rtrim(WordList_T &ioWordList, const NbOfLetters_T &iMinWordLength)
Definition: Filter.cpp:72
static void trim(std::string &ioPhrase, const NbOfLetters_T &iMinWordLength=4)
Definition: Filter.cpp:131
std::list< Word_T > WordList_T
bool hasGoodSize(const std::string &iWord, const NbOfLetters_T &iMinWordLength)
Definition: Filter.cpp:39
static bool shouldKeep(const std::string &iPhrase, const std::string &iWord)
Definition: Filter.cpp:144
const BlackList_T K_BLACK_LIST
Definition: BasConst.cpp:130
void tokeniseStringIntoWordList(const std::string &iPhrase, WordList_T &ioWordList)
Definition: Utilities.cpp:16