OpenTREP Logo  0.6.0
C++ Open Travel Request Parsing Library
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
IndexBuilder.cpp
Go to the documentation of this file.
1 // //////////////////////////////////////////////////////////////////////
2 // Import section
3 // //////////////////////////////////////////////////////////////////////
4 // STL
5 #include <cassert>
6 #include <string>
7 #include <vector>
8 #include <exception>
9 // Boost
10 #include <boost/filesystem.hpp>
11 #include <boost/filesystem/fstream.hpp>
12 #include <boost/tokenizer.hpp>
13 // OpenTrep
18 #include <opentrep/bom/World.hpp>
19 #include <opentrep/bom/Place.hpp>
24 // Xapian
25 #include <xapian.h>
26 
27 namespace OPENTREP {
28 
29  // //////////////////////////////////////////////////////////////////////
30  void addToXapian (const Place& iPlace, Xapian::Document& ioDocument,
31  Xapian::WritableDatabase& ioDatabase) {
38  Xapian::TermGenerator lTermGenerator;
39  lTermGenerator.set_database (ioDatabase);
40  lTermGenerator.set_document (ioDocument);
41 
42  // DEBUG
43  // OPENTREP_LOG_DEBUG ("Indexing for " << iPlace.describeKey());
44 
45  const Place::TermSetMap_T& lTermSetMap = iPlace.getTermSetMap();
46  for (Place::TermSetMap_T::const_iterator itStringSet = lTermSetMap.begin();
47  itStringSet != lTermSetMap.end(); ++itStringSet) {
48  // Retrieve the weight
49  const Weight_T& lWeight = itStringSet->first;
50  const Xapian::termcount lWDFInc =
51  static_cast<const Xapian::termcount> (lWeight);
52 
53  // Retrieve the set of strings for that weight
54  const Place::StringSet_T& lTermSet = itStringSet->second;
55  for (Place::StringSet_T::const_iterator itString = lTermSet.begin();
56  itString != lTermSet.end(); ++itString) {
57  const std::string& lString = *itString;
58  lTermGenerator.index_text (lString, lWDFInc);
59 
60  // DEBUG
61  //OPENTREP_LOG_DEBUG("[" << lWeight << "/" << lWDFInc << "] "<< lString);
62  }
63  }
64 
65  // Spelling terms
66  const Place::StringSet_T& lSpellingSet = iPlace.getSpellingSet();
67  for (Place::StringSet_T::const_iterator itTerm = lSpellingSet.begin();
68  itTerm != lSpellingSet.end(); ++itTerm) {
69  const std::string& lTerm = *itTerm;
70  ioDatabase.add_spelling (lTerm);
71  }
72 
73  // DEBUG
74  OPENTREP_LOG_DEBUG ("Added terms for '" << iPlace.describeKey()
75  << "': " << iPlace.describeSets()
76  << " into " << ioDocument.get_description());
77  }
78 
79  // //////////////////////////////////////////////////////////////////////
80  void IndexBuilder::addDocumentToIndex(Xapian::WritableDatabase& ioDatabase,
81  Place& ioPlace,
82  const OTransliterator& iTransliterator) {
83 
84  // Create an empty Xapian document
85  Xapian::Document lDocument;
86 
87  // Retrieve the raw data string, to be stored as is within
88  // the Xapian document
89  const RawDataString_T& lRawDataString = ioPlace.getRawDataString();
90 
91  // The Xapian document data is indeed the same as the one of the
92  // ORI-maintained list of POR (points of reference), allowing the search
93  // process to use exactly the same parser as the indexation process
94  lDocument.set_data (lRawDataString);
95 
96  // Build the (STL) sets of terms to be added to the Xapian index and
97  // spelling dictionary
98  ioPlace.buildIndexSets (iTransliterator);
99 
100  // Add the (STL) sets of terms to the Xapian index and spelling dictionary
101  addToXapian (ioPlace, lDocument, ioDatabase);
102 
103  // Add the document to the database
104  const Xapian::docid& lDocID = ioDatabase.add_document (lDocument);
105 
106  // Assign back the newly generated Xapian document ID to the
107  // Place object
108  ioPlace.setDocID (lDocID);
109  }
110 
111  // //////////////////////////////////////////////////////////////////////
112  NbOfDBEntries_T IndexBuilder::
113  buildSearchIndex (const PORFilePath_T& iPORFilePath,
114  const TravelDBFilePath_T& iTravelDBFilePath,
115  const OTransliterator& iTransliterator) {
116  NbOfDBEntries_T oNbOfEntries = 0;
117 
118  // Check that the directory for the Xapian database (index) exists and,
119  // if not, create it.
120  // DEBUG
121  OPENTREP_LOG_DEBUG ("The Xapian database ('" << iTravelDBFilePath
122  << "') will be cleared");
123  boost::filesystem::path lTravelDBFilePath (iTravelDBFilePath.begin(),
124  iTravelDBFilePath.end());
125  boost::filesystem::remove_all (lTravelDBFilePath);
126  boost::filesystem::create_directories (lTravelDBFilePath);
127 
128  // Check whether the just created directory exists and is a directory.
129  if (!(boost::filesystem::exists (lTravelDBFilePath)
130  && boost::filesystem::is_directory (lTravelDBFilePath))) {
131  std::ostringstream oStr;
132  oStr << "The file-path to the Xapian database/index ('"
133  << iPORFilePath << "') does not exist or is not a directory.";
134  OPENTREP_LOG_ERROR (oStr.str());
135  throw FileNotFoundException (oStr.str());
136  }
137 
138  // Create the Xapian database (index). As the directory has been fully
139  // cleaned, deleted and re-created, that Xapian database (index) is empty.
140  Xapian::WritableDatabase lDatabase (iTravelDBFilePath, Xapian::DB_CREATE);
141 
142  // DEBUG
143  OPENTREP_LOG_DEBUG ("The Xapian database ('" << iTravelDBFilePath
144  << "') has been checked and open");
145 
154  lDatabase.begin_transaction();
155 
156  // DEBUG
157  OPENTREP_LOG_DEBUG ("A transaction has begun on the Xapian database ('"
158  << iTravelDBFilePath << "')");
159 
164  // DEBUG
165  OPENTREP_LOG_DEBUG ("Parsing por input file: " << iPORFilePath);
166 
167  // Check whether the file to be parsed exists and is readable.
168  boost::filesystem::path lPORFilePath (iPORFilePath.begin(),
169  iPORFilePath.end());
170  if (!(boost::filesystem::exists (lPORFilePath)
171  && boost::filesystem::is_regular_file (lPORFilePath))) {
172  OPENTREP_LOG_ERROR ("The POR file " << iPORFilePath
173  << " does not exist or cannot be open." << std::endl);
174 
175  throw FileNotFoundException ("The POR file " + iPORFilePath
176  + " does not exist or cannot be read");
177  }
178 
179  // Open the file to be parsed
180  boost::filesystem::ifstream fileToBeParsed (lPORFilePath);
181  Place& lPlace = FacPlace::instance().create();
182  std::string itReadLine;
183  while (std::getline (fileToBeParsed, itReadLine)) {
184  // Initialise the parser
185  PORStringParser lStringParser (itReadLine);
186 
187  // Parse the string
188  const Location& lLocation = lStringParser.generateLocation();
189  //const LocationKey& lLocationKey = lLocation.getKey();
190 
191  // DEBUG
192  //OPENTREP_LOG_DEBUG ("[BEF-ADD] " << lLocationKey);
193 
194  // When the line/string was relevant, create a BOM instance from
195  // the Location structure.
196  if (!(lLocation.getCommonName() == "NotAvailable")) {
197  // Fill the Place object with the Location structure.
198  lPlace.setLocation (lLocation);
199 
200  // Add the document, associated to the Place object, to the Xapian index
201  IndexBuilder::addDocumentToIndex (lDatabase, lPlace, iTransliterator);
202 
203  // DEBUG
204  /*
205  OPENTREP_LOG_DEBUG ("[AFT-ADD] " << lLocationKey
206  << ", Place: " << lPlace);
207  */
208 
209  // Iteration
210  ++oNbOfEntries;
211 
212  // DEBUG
213  OPENTREP_LOG_DEBUG ("[" << oNbOfEntries << "] " << lPlace);
214 
215  // Reset for next turn
216  lPlace.resetMatrix();
217  lPlace.resetIndexSets();
218  }
219  }
220 
221  // Commit the pending modifications on the Xapian database (index)
222  lDatabase.commit_transaction();
223 
224  // DEBUG
225  OPENTREP_LOG_DEBUG ("Xapian has indexed " << oNbOfEntries << " entries.");
226 
234  lDatabase.close();
235 
236  return oNbOfEntries;
237  }
238 
239 }
#define OPENTREP_LOG_ERROR(iToBeLogged)
Definition: Logger.hpp:23
std::set< std::string > StringSet_T
Definition: Place.hpp:39
#define OPENTREP_LOG_DEBUG(iToBeLogged)
Definition: Logger.hpp:32
static FacPlace & instance()
Definition: FacPlace.cpp:29
unsigned short Weight_T
unsigned int NbOfDBEntries_T
const TermSetMap_T & getTermSetMap() const
Definition: Place.hpp:466
void addToXapian(const Place &iPlace, Xapian::Document &ioDocument, Xapian::WritableDatabase &ioDatabase)
std::map< const Weight_T, StringSet_T > TermSetMap_T
Definition: Place.hpp:40
Class modelling a place/POR (point of reference).
Definition: Place.hpp:28
std::string describeKey() const
Definition: Place.hpp:1002
const StringSet_T & getSpellingSet() const
Definition: Place.hpp:480
Place & create()
Definition: FacPlace.cpp:41
std::string describeSets() const
Definition: Place.cpp:155