Merge pull request kaustubh-pandey#10 from wasim919/master
BM25F similarity done.
wasim919 authored Dec 7, 2018
2 parents d3e4be2 + 9e9b51b commit fa1fb9a
Showing 12 changed files with 421 additions and 1 deletion.
1 change: 1 addition & 0 deletions bin/.gitignore
@@ -0,0 +1 @@
/lucene_read_file/
Binary file modified bin/lucene_read_file/LuceneReadIndexFromFileExample.class
Binary file added indexedFiles/_0.cfe
Binary file renamed indexedFiles/_2.cfs → indexedFiles/_0.cfs
Binary file added indexedFiles/_0.si
Binary file removed indexedFiles/_2.cfe
Binary file removed indexedFiles/_2.si
Binary file added indexedFiles/segments_1
Binary file removed indexedFiles/segments_3
238 changes: 238 additions & 0 deletions src/lucene_read_file/LuceneReadIndexFromExample_new.java
@@ -0,0 +1,238 @@
package lucene_read_file;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.regex.*;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BlendedTermQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
// Not yet used; see the per-field similarity sketch after this file.
import org.apache.lucene.search.similarities.PerFieldSimilarityWrapper;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class LuceneReadIndexFromExample_new
{
    // Directory that contains the Lucene indexes.
    private static final String INDEX_DIR = "indexedFiles";

    public static void main(String[] args) throws Exception
    {
        // Create the Lucene searcher; it searches over a single IndexReader.
        IndexSearcher searcher = createSearcher();

        // Search indexed contents using the search term, extracting and
        // normalising any dates it contains.
        String queryString = "Fret 31-12-2016 WASHINGTON";
        ArrayList<String> extractedDates = getDates(queryString);
        ArrayList<String> processedDates = processDates(extractedDates);
        ArrayList<TopDocs> foundDateDocs = new ArrayList<TopDocs>();
        // Earlier per-date search path, kept for reference:
        // for (int i = 0; i < processedDates.size(); i++) {
        //     System.out.println("Date:" + processedDates.get(i) + "::");
        //     TopDocs foundDocs = searchInDate(processedDates.get(i), searcher);
        //     foundDateDocs.add(foundDocs);
        // }
        // for (int i = 0; i < foundDateDocs.size(); i++) {
        //     TopDocs foundDocs = foundDateDocs.get(i);
        //     for (int j = 0; j < foundDocs.scoreDocs.length; j++) {
        //         System.out.print(foundDocs.scoreDocs[j].doc + " ");
        //     }
        // }
        System.out.println("-----------------");

        // Per-term boosts can weight fields differently, e.g.:
        // BlendedTermQuery bm25fQuery = new BlendedTermQuery.Builder()
        //         .add(new Term("title", "moby"), 2.0f)
        //         .add(new Term("description", "moby"), 4.0f)
        //         .setRewriteMethod(BlendedTermQuery.BOOLEAN_REWRITE)
        //         .build();

        // Blend the query terms across the date, body and title fields.
        // BOOLEAN_REWRITE rewrites the blended terms into a BooleanQuery of
        // SHOULD clauses that share blended index statistics.
        BlendedTermQuery bm25fQuery = new BlendedTermQuery.Builder()
                .add(new Term("date", "2016-12-31"))
                .add(new Term("body", "Trump"))
                .add(new Term("title", "Winter"))
                .setRewriteMethod(BlendedTermQuery.BOOLEAN_REWRITE)
                .build();
        System.out.println(bm25fQuery);
        TopDocs docs = searcher.search(bm25fQuery, 10);
        ScoreDoc[] hits = docs.scoreDocs;

        System.out.println("Found " + hits.length + " hits.");
        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            // Print the stored "path" and "title" fields of each hit.
            System.out.println((i + 1) + ". " + d.get("path") + "\t" + d.get("title"));
        }
    }
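    // BM25F in a nutshell (a sketch, not part of this commit's code): rather
    // than scoring each field independently, per-field term frequencies are
    // combined with field weights w_f as tf'(t, d) = sum_f w_f * tf(t, f, d),
    // and the blended tf' is fed into the usual BM25 saturation formula.
    // BlendedTermQuery approximates this by blending term statistics across
    // the fields it is given.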


    // Earlier approach, kept for reference: search each field separately and
    // merge the results by intersecting document ids.
    //
    // TopDocs foundDocs = searchInDate("2018/11/19", searcher);
    // TopDocs foundDocs2 = searchInBody(queryString, searcher);
    // TopDocs foundDocs3 = searchInTitle(queryString, searcher);
    // ArrayList<Integer> arr = new ArrayList<Integer>();
    //
    // System.out.println("Body Match:");
    // for (int i = 0; i < foundDocs2.scoreDocs.length; i++) {
    //     System.out.print(foundDocs2.scoreDocs[i].doc + " ");
    // }
    // System.out.println();
    // for (int i = 0; i < foundDocs3.scoreDocs.length; i++) {
    //     System.out.print(foundDocs3.scoreDocs[i].doc + " ");
    // }
    // System.out.println();
    //
    // // Intersect the date and body result lists.
    // for (int i = 0; i < foundDocs.scoreDocs.length; i++) {
    //     for (int j = 0; j < foundDocs2.scoreDocs.length; j++) {
    //         if (foundDocs.scoreDocs[i].doc == foundDocs2.scoreDocs[j].doc) {
    //             arr.add(foundDocs.scoreDocs[i].doc);
    //         }
    //     }
    // }
    // System.out.println("Common Docs :: ");
    // for (int k = 0; k < arr.size(); k++) {
    //     System.out.println(arr.get(k));
    // }
    //
    // // Total found documents per field.
    // for (int i = 0; i < foundDateDocs.size(); i++) {
    //     System.out.println("Total Results Date matches :: " + foundDateDocs.get(i).totalHits);
    // }
    // System.out.println("Total Results Body matches :: " + foundDocs2.totalHits);
    // System.out.println("Total Results Title matches :: " + foundDocs3.totalHits);
    //
    // // Print the path, date and score of each matching document.
    // for (ScoreDoc sd : foundDocs2.scoreDocs) {
    //     Document d = searcher.doc(sd.doc);
    //     System.out.println("Path : " + d.get("path") + " Date: " + d.get("date") + ", Score : " + sd.score);
    // }
    //
    // public Document getDocument(ScoreDoc scoreDoc) throws IOException {
    //     return indexSearcher.doc(scoreDoc.doc);
    // }

    // Query-preprocessing helpers.

    private static ArrayList<String> getDates(String queryString) {
        // Match yyyy/mm/dd, dd/mm/yyyy, dd-mm-yyyy or yyyy-mm-dd.
        Matcher m = Pattern.compile(
                "(\\d{4}/\\d{2}/\\d{2}|\\d{2}/\\d{2}/\\d{4}|\\d{2}-\\d{2}-\\d{4}|\\d{4}-\\d{2}-\\d{2})",
                Pattern.CASE_INSENSITIVE).matcher(queryString);
        ArrayList<String> mydates = new ArrayList<String>();
        while (m.find()) {
            mydates.add(m.group(1));
        }
        return mydates;
    }

    // Normalise an extracted date to yyyy-mm-dd.
    private static String convertDate(String date) {
        String returnDate = date;
        if (date.charAt(2) == '/') {            // dd/mm/yyyy
            String[] newDate = date.split("/");
            returnDate = newDate[2] + "-" + newDate[1] + "-" + newDate[0];
        } else if (date.charAt(2) == '-') {     // dd-mm-yyyy
            String[] newDate = date.split("-");
            returnDate = newDate[2] + "-" + newDate[1] + "-" + newDate[0];
        } else if (date.charAt(4) == '/') {     // yyyy/mm/dd
            returnDate = date.replace('/', '-');
        }
        return returnDate;                      // yyyy-mm-dd passes through unchanged
    }

    private static ArrayList<String> processDates(ArrayList<String> extractedDates) {
        ArrayList<String> processedDates = new ArrayList<String>();
        for (int i = 0; i < extractedDates.size(); i++) {
            processedDates.add(convertDate(extractedDates.get(i)));
        }
        return processedDates;
    }
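    // Worked example of the two helpers above: for the query string
    // "Fret 31-12-2016 WASHINGTON", getDates extracts ["31-12-2016"] and
    // processDates normalises it to ["2016-12-31"], the same yyyy-mm-dd
    // form used by the "date" terms in the BlendedTermQuery in main.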
    private static TopDocs searchInDate(String textToFind, IndexSearcher searcher) throws Exception
    {
        // Create the search query as a single exact term on the "date" field.
        // QueryParser qp = new QueryParser("date", new StandardAnalyzer());
        // Query query = qp.parse(textToFind);
        PhraseQuery.Builder builder = new PhraseQuery.Builder();
        builder.add(new Term("date", textToFind));
        PhraseQuery pq = builder.build();

        // Search the index.
        TopDocs hits = searcher.search(pq, 10);
        return hits;
    }
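    // Design note: a QueryParser with StandardAnalyzer (the commented-out
    // alternative above) would split "2016-12-31" into separate tokens, so
    // the exact Term query keeps the date as one token. This assumes the
    // "date" field was indexed as a single untokenized value.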

    private static TopDocs searchInTitle(String textToFind, IndexSearcher searcher) throws Exception
    {
        // Create the search query.
        QueryParser qp = new QueryParser("title", new StandardAnalyzer());
        Query query = qp.parse(textToFind);

        // Search the index.
        TopDocs hits = searcher.search(query, 10);
        if (hits.scoreDocs.length > 0) {
            System.out.println(hits.scoreDocs[0].doc);
        }
        return hits;
    }
    private static TopDocs searchInBody(String textToFind, IndexSearcher searcher) throws Exception
    {
        // Create the search query.
        QueryParser qp = new QueryParser("body", new StandardAnalyzer());
        Query query = qp.parse(textToFind);

        // Search the index.
        TopDocs hits = searcher.search(query, 10);
        // System.out.println(hits.scoreDocs[0].doc);
        return hits;
    }

    private static IndexSearcher createSearcher() throws IOException
    {
        Directory dir = FSDirectory.open(Paths.get(INDEX_DIR));

        // DirectoryReader provides a point-in-time view of a Lucene index.
        IndexReader reader = DirectoryReader.open(dir);

        // Index searcher.
        IndexSearcher searcher = new IndexSearcher(reader);
        return searcher;
    }
}
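The file imports PerFieldSimilarityWrapper and Similarity but does not use them yet. A minimal sketch of how they could supply the per-field weighting that BM25F calls for, set on the searcher before querying; the field name and the k1/b values below are illustrative assumptions, not part of this commit:

    // Additionally requires: import org.apache.lucene.search.similarities.BM25Similarity;
    Similarity perFieldBm25 = new PerFieldSimilarityWrapper() {
        @Override
        public Similarity get(String field) {
            // Short fields such as "title" often use a smaller b (less length
            // normalisation); these parameters are placeholders, not tuned values.
            if ("title".equals(field)) {
                return new BM25Similarity(1.2f, 0.3f);
            }
            return new BM25Similarity(); // Lucene defaults: k1 = 1.2, b = 0.75
        }
    };
    searcher.setSimilarity(perFieldBm25);

Note that for the per-field parameters to fully take effect, the same similarity would also have to be set on the IndexWriterConfig when the index is built, since length norms are encoded at index time.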
4 changes: 3 additions & 1 deletion src/lucene_read_file/LuceneReadIndexFromFileExample.java
@@ -33,7 +33,7 @@ public static void main(String[] args) throws Exception
IndexSearcher searcher = createSearcher();

//Search indexed contents using search term
String queryString= "Fret 31-12-2016 WASHINGTON";
String queryString= "Winter 2016-12-31 WASHINGTON";
ArrayList<String> extractedDates=getDates(queryString);
ArrayList<String> processedDates=processDates(extractedDates);
ArrayList<TopDocs> foundDateDocs=new ArrayList<TopDocs>();
@@ -180,6 +180,8 @@ private static TopDocs searchInBody(String textToFind, IndexSearcher searcher) throws Exception
Query query = qp.parse(textToFind);

//search the index
System.out.println("body");
System.out.print(query);
TopDocs hits = searcher.search(query, 10);
//System.out.println(hits.scoreDocs[0].doc);
return hits;
