Main Page | Namespace List | Class Hierarchy | Class List | File List | Namespace Members | Class Members | File Members | Related Pages

indri::api::IndexEnvironment Class Reference

Principal class for interacting with Indri indexes during index construction. Provides the API for opening or creating an index and its associated repository, setting indexing and text parsing parameters, and adding documents to the repository. More...

#include <IndexEnvironment.hpp>

List of all members.

Public Member Functions

 IndexEnvironment ()
 ~IndexEnvironment ()
void setOffsetAnnotationsPath (const std::string &offsetAnnotationsRoot)
void setOffsetMetadataPath (const std::string &offsetMetadataRoot)
void setAnchorTextPath (const std::string &anchorTextRoot)
void setDocumentRoot (const std::string &documentRoot)
void addFileClass (const std::string &name, const std::string &iterator, const std::string &parser, const std::string &tokenizer, const std::string &startDocTag, const std::string &endDocTag, const std::string &endMetadataTag, const std::vector< std::string > &include, const std::vector< std::string > &exclude, const std::vector< std::string > &index, const std::vector< std::string > &metadata, const std::map< indri::parse::ConflationPattern *, std::string > &conflations)
indri::parse::FileClassEnvironmentFactory::Specification * getFileClassSpec (const std::string &name)
void addFileClass (const indri::parse::FileClassEnvironmentFactory::Specification &spec)
void setIndexedFields (const std::vector< std::string > &fieldNames)
void setNumericField (const std::string &fieldName, bool isNumeric, const std::string &parserName="")
void setOrdinalField (const std::string &fieldName, bool isOrdinal)
void setParentalField (const std::string &fieldName, bool isParental)
void setMetadataIndexedFields (const std::vector< std::string > &forwardFieldNames, const std::vector< std::string > &backwardFieldNames)
void setStopwords (const std::vector< std::string > &stopwords)
void setStemmer (const std::string &stemmer)
void setMemory (UINT64 memory)
void setNormalization (bool flag)
void setStoreDocs (bool flag)
void setOffsetAnnotationIndexHint (indri::parse::OffsetAnnotationIndexHint hintType)
void create (const std::string &repositoryPath, IndexStatus *callback=0)
void open (const std::string &repositoryPath, IndexStatus *callback=0)
void close ()
 close the index and repository

void addFile (const std::string &fileName)
void addFile (const std::string &fileName, const std::string &fileClass)
lemur::api::DOCID_T addString (const std::string &documentString, const std::string &fileClass, const std::vector< indri::parse::MetadataPair > &metadata)
lemur::api::DOCID_T addString (const std::string &documentString, const std::string &fileClass, const std::vector< indri::parse::MetadataPair > &metadata, const std::vector< indri::parse::TagExtent * > &tags)
lemur::api::DOCID_T addParsedDocument (ParsedDocument *document)
void deleteDocument (lemur::api::DOCID_T documentID)
int documentsIndexed ()
 Returns the number of documents indexed so far in this session.

int documentsSeen ()
void compact ()

Static Public Member Functions

void merge (const std::string &outputIndex, const std::vector< std::string > &inputIndexes)

Private Member Functions

void _getParsingContext (indri::parse::Parser **parser, indri::parse::Tokenizer **tokenizer, indri::parse::DocumentIterator **iterator, indri::parse::Conflater **conflater, const std::string &extension)
std::vector< indri::parse::Transformation * > _createAnnotators (const std::string &fileName, const std::string &fileClass, indri::parse::Conflater **conflater)
ParsedDocument * _applyAnnotators (std::vector< indri::parse::Transformation * > &annotators, ParsedDocument *parsed)

Private Attributes

IndexStatus * _callback
Parameters * _options
std::string _repositoryPath
indri::collection::Repository _repository
int _documents
std::string _error
std::string _offsetAnnotationsRoot
std::string _offsetMetadataRoot
std::string _anchorTextRoot
std::string _documentRoot
Parameters _parameters
indri::parse::FileClassEnvironmentFactory _fileClassFactory
indri::parse::AnchorTextAnnotator _annotator
indri::parse::OffsetAnnotationAnnotator _oa_annotator
indri::parse::OffsetMetadataAnnotator _om_annotator
std::map< std::string, indri::parse::FileClassEnvironment * > _environments
int _documentsIndexed
int _documentsSeen

Friends

class QueryEnvironment


Detailed Description

Principal class for interacting with Indri indexes during index construction. Provides the API for opening or creating an index and its associated repository, setting indexing and text parsing parameters, and adding documents to the repository.


Constructor & Destructor Documentation

indri::api::IndexEnvironment::IndexEnvironment ()
 

indri::api::IndexEnvironment::~IndexEnvironment ()
 


Member Function Documentation

indri::api::ParsedDocument * indri::api::IndexEnvironment::_applyAnnotators std::vector< indri::parse::Transformation * > &  annotators,
ParsedDocument *  parsed
[private]
 

std::vector< indri::parse::Transformation * > indri::api::IndexEnvironment::_createAnnotators const std::string &  fileName,
const std::string &  fileClass,
indri::parse::Conflater **  conflater
[private]
 

void indri::api::IndexEnvironment::_getParsingContext indri::parse::Parser **  parser,
indri::parse::Tokenizer **  tokenizer,
indri::parse::DocumentIterator **  iterator,
indri::parse::Conflater **  conflater,
const std::string &  extension
[private]
 

void indri::api::IndexEnvironment::addFile const std::string &  fileName,
const std::string &  fileClass
 

add a file of the specified file class to the index and repository

Parameters:
fileName the file to add
fileClass the file class to add (eg trecweb).

void indri::api::IndexEnvironment::addFile (const std::string &fileName)
 

Add the text in a file to the index and repository. The fileClass of this file will be chosen based on the file extension. If the file has no extension, it will be skipped. Information about indexing progress will be passed to the callback.

See also:
setCallback()
Parameters:
fileName the file to add

void indri::api::IndexEnvironment::addFileClass (const indri::parse::FileClassEnvironmentFactory::Specification &spec) [inline]
 

Add a file class.

Parameters:
spec The file class to add.

void indri::api::IndexEnvironment::addFileClass const std::string &  name,
const std::string &  iterator,
const std::string &  parser,
const std::string &  tokenizer,
const std::string &  startDocTag,
const std::string &  endDocTag,
const std::string &  endMetadataTag,
const std::vector< std::string > &  include,
const std::vector< std::string > &  exclude,
const std::vector< std::string > &  index,
const std::vector< std::string > &  metadata,
const std::map< indri::parse::ConflationPattern *, std::string > &  conflations
 

Add parsing information for a file class. Data for these parameters is passed into the FileClassEnvironmentFactory

Parameters:
name name of this file class, eg trecweb
iterator document iterator for this file class
parser document parser for this file class
tokenizer document tokenizer for this file class
startDocTag tag indicating start of a document
endDocTag tag indicating the end of a document
endMetadataTag tag indicating the end of the metadata fields
include default tags whose contents should be included in the index
exclude tags whose contents should be excluded from the index
index tags that should be forwarded to the index for tag extents
metadata tags whose contents should be indexed as metadata
conflations tags that should be conflated

lemur::api::DOCID_T indri::api::IndexEnvironment::addParsedDocument (ParsedDocument *document)
 

add an already parsed document to the index and repository

Parameters:
document the document to add

lemur::api::DOCID_T indri::api::IndexEnvironment::addString const std::string &  documentString,
const std::string &  fileClass,
const std::vector< indri::parse::MetadataPair > &  metadata,
const std::vector< indri::parse::TagExtent * > &  tags
 

Adds a string to the index and repository. The documentString is assumed to contain the kind of text that would be found in a file of type fileClass.

Parameters:
documentString the document to add
fileClass the file class to add (eg trecweb).
metadata the metadata pairs associated with the string.
tags offset annotations to be indexed as field data. The begin and end values of each TagExtent should specify byte (not character or token) offsets within the document string. These byte offsets are converted to token offsets after document string parsing.

lemur::api::DOCID_T indri::api::IndexEnvironment::addString const std::string &  documentString,
const std::string &  fileClass,
const std::vector< indri::parse::MetadataPair > &  metadata
 

Adds a string to the index and repository. The documentString is assumed to contain the kind of text that would be found in a file of type fileClass.

Parameters:
documentString the document to add
fileClass the file class to add (eg trecweb).
metadata the metadata pairs associated with the string.

void indri::api::IndexEnvironment::close ()
 

close the index and repository

void indri::api::IndexEnvironment::compact ()
 

Permanently deletes information for documents that have been deleted from the index and reclaims used disk space.

void indri::api::IndexEnvironment::create const std::string &  repositoryPath,
IndexStatus *  callback = 0
 

create a new index and repository

Parameters:
repositoryPath the path to the repository
callback IndexStatus object to be notified of indexing progress.

void indri::api::IndexEnvironment::deleteDocument (lemur::api::DOCID_T documentID)
 

Delete an existing document.

Parameters:
documentID The document to delete.

int indri::api::IndexEnvironment::documentsIndexed ()
 

Returns the number of documents indexed so far in this session.

int indri::api::IndexEnvironment::documentsSeen ()
 

Returns the number of documents considered for indexing, which is the sum of the documents indexed and the documents skipped.

indri::parse::FileClassEnvironmentFactory::Specification * indri::api::IndexEnvironment::getFileClassSpec (const std::string &name) [inline]
 

Get a named file class.

Parameters:
name The name of the file class to retrieve.

void indri::api::IndexEnvironment::merge const std::string &  outputIndex,
const std::vector< std::string > &  inputIndexes
[static]
 

Merges the contents of the indexes referenced in the inputIndexes list and creates a new index called outputIndex. The final index is compacted (contains no information about deleted documents).

Parameters:
outputIndex The pathname to the index to create (should not exist yet).
inputIndexes The pathnames to indexes to merge. These indexes should not currently be open.

void indri::api::IndexEnvironment::open const std::string &  repositoryPath,
IndexStatus *  callback = 0
 

open an existing index and repository

Parameters:
repositoryPath the path to the repository
callback IndexStatus object to be notified of indexing progress.

void indri::api::IndexEnvironment::setAnchorTextPath (const std::string &anchorTextRoot)
 

Set anchor text root path.

Parameters:
anchorTextRoot path to anchor text root.

void indri::api::IndexEnvironment::setDocumentRoot (const std::string &documentRoot)
 

Set the document root path

Parameters:
documentRoot path to document root.

void indri::api::IndexEnvironment::setIndexedFields (const std::vector< std::string > &fieldNames)
 

Set names of fields to be indexed. This call indicates to the index that information about these fields should be stored in the index so they can be used in queries. This does not affect whether or not the text in a particular field is stored in an index.

See also:
addFileClass
Parameters:
fieldNames the list of fields.

void indri::api::IndexEnvironment::setMemory (UINT64 memory)
 

set the amount of memory to use for internal structures

Parameters:
memory the number of bytes to use.

void indri::api::IndexEnvironment::setMetadataIndexedFields const std::vector< std::string > &  forwardFieldNames,
const std::vector< std::string > &  backwardFieldNames
 

Set names of metadata fields to be indexed for fast retrieval. The forward fields are indexed in a B-Tree mapping (documentID, metadataValue). If a field is not forward indexed, the documentMetadata calls will still work, but they will be slower (the document has to be retrieved, decompressed and parsed to get the metadata back, instead of just a B-Tree lookup). The backward indexed fields store a mapping of (metadataValue, documentID). If a field is not backward indexed, the documentIDsFromMetadata and documentFromMetadata calls will not work.

Parameters:
forwardFieldNames the list of fields to forward index.
backwardFieldNames the list of fields to backward index.

void indri::api::IndexEnvironment::setNormalization (bool flag)
 

set normalization of case and some punctuation; default is true (normalize during indexing and at query time)

Parameters:
flag True, if text should be normalized, false otherwise.

void indri::api::IndexEnvironment::setNumericField const std::string &  fieldName,
bool  isNumeric,
const std::string &  parserName = ""
 

Set the numeric property of a field.

Parameters:
fieldName the field.
isNumeric true if the field is a numeric field, false if not.
parserName The name of the Transformation to use to compute the numeric value of the field. Repository currently recognizes the name NumericFieldAnnotator.

void indri::api::IndexEnvironment::setOffsetAnnotationIndexHint (indri::parse::OffsetAnnotationIndexHint hintType)
 

provides the indexer with the hint strategy to use for speed optimizations for indexing offset annotations

Parameters:
hintType the hint type (a value of the OffsetAnnotationIndexHint enum)

void indri::api::IndexEnvironment::setOffsetAnnotationsPath (const std::string &offsetAnnotationsRoot)
 

Set offset annotations root path.

Parameters:
offsetAnnotationsRoot path to offset annotations root.

void indri::api::IndexEnvironment::setOffsetMetadataPath (const std::string &offsetMetadataRoot)
 

Set offset metadata root path.

Parameters:
offsetMetadataRoot path to offset metadata root.

void indri::api::IndexEnvironment::setOrdinalField const std::string &  fieldName,
bool  isOrdinal
 

Set the ordinal property of a field.

Parameters:
fieldName the field.
isOrdinal true if the field is an ordinal field, false if not.

void indri::api::IndexEnvironment::setParentalField const std::string &  fieldName,
bool  isParental
 

Set the parental property of a field.

Parameters:
fieldName the field.
isParental true if the field stores its parent, false if not

void indri::api::IndexEnvironment::setStemmer (const std::string &stemmer)
 

set the stemmer to use

Parameters:
stemmer the stemmer to use. One of krovetz, porter

void indri::api::IndexEnvironment::setStopwords (const std::vector< std::string > &stopwords)
 

set the list of stopwords

Parameters:
stopwords the list of stopwords

void indri::api::IndexEnvironment::setStoreDocs (bool flag)
 

set storing of ParsedDocuments; default is true

Parameters:
flag true, if ParsedDocuments should be stored, false otherwise.


Friends And Related Function Documentation

friend class QueryEnvironment [friend]
 


Member Data Documentation

std::string indri::api::IndexEnvironment::_anchorTextRoot [private]
 

indri::parse::AnchorTextAnnotator indri::api::IndexEnvironment::_annotator [private]
 

IndexStatus* indri::api::IndexEnvironment::_callback [private]
 

std::string indri::api::IndexEnvironment::_documentRoot [private]
 

int indri::api::IndexEnvironment::_documents [private]
 

int indri::api::IndexEnvironment::_documentsIndexed [private]
 

int indri::api::IndexEnvironment::_documentsSeen [private]
 

std::map<std::string, indri::parse::FileClassEnvironment*> indri::api::IndexEnvironment::_environments [private]
 

std::string indri::api::IndexEnvironment::_error [private]
 

indri::parse::FileClassEnvironmentFactory indri::api::IndexEnvironment::_fileClassFactory [private]
 

indri::parse::OffsetAnnotationAnnotator indri::api::IndexEnvironment::_oa_annotator [private]
 

std::string indri::api::IndexEnvironment::_offsetAnnotationsRoot [private]
 

std::string indri::api::IndexEnvironment::_offsetMetadataRoot [private]
 

indri::parse::OffsetMetadataAnnotator indri::api::IndexEnvironment::_om_annotator [private]
 

Parameters* indri::api::IndexEnvironment::_options [private]
 

Parameters indri::api::IndexEnvironment::_parameters [private]
 

indri::collection::Repository indri::api::IndexEnvironment::_repository [private]
 

std::string indri::api::IndexEnvironment::_repositoryPath [private]
 


The documentation for this class was generated from the following files:
Generated on Tue Jun 15 11:02:59 2010 for Lemur by doxygen 1.3.4