- 浏览: 98775 次
- 性别:
- 来自: 吉林
文章分类
最新评论
lucene第二步,lucene搜索
出自:http://blog.csdn.net/wxwzy738/article/details/8799656 的整理
1、工程结构
2、查询语法代码
[java]
view plaincopy
- packageorg.itat.index;
- importjava.io.File;
- importjava.io.IOException;
- importjava.io.StringReader;
- importjava.text.ParseException;
- importjava.text.SimpleDateFormat;
- importjava.util.Date;
- importjava.util.HashMap;
- importjava.util.Map;
- importorg.apache.lucene.analysis.Analyzer;
- importorg.apache.lucene.analysis.TokenStream;
- importorg.apache.lucene.analysis.standard.StandardAnalyzer;
- importorg.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- importorg.apache.lucene.document.Document;
- importorg.apache.lucene.document.Field;
- importorg.apache.lucene.document.NumericField;
- importorg.apache.lucene.index.CorruptIndexException;
- importorg.apache.lucene.index.IndexReader;
- importorg.apache.lucene.index.IndexWriter;
- importorg.apache.lucene.index.IndexWriterConfig;
- importorg.apache.lucene.index.Term;
- importorg.apache.lucene.queryParser.QueryParser;
- importorg.apache.lucene.search.BooleanClause.Occur;
- importorg.apache.lucene.search.BooleanQuery;
- importorg.apache.lucene.search.FuzzyQuery;
- importorg.apache.lucene.search.IndexSearcher;
- importorg.apache.lucene.search.NumericRangeQuery;
- importorg.apache.lucene.search.PhraseQuery;
- importorg.apache.lucene.search.PrefixQuery;
- importorg.apache.lucene.search.Query;
- importorg.apache.lucene.search.ScoreDoc;
- importorg.apache.lucene.search.TermQuery;
- importorg.apache.lucene.search.TermRangeQuery;
- importorg.apache.lucene.search.TopDocs;
- importorg.apache.lucene.search.WildcardQuery;
- importorg.apache.lucene.store.Directory;
- importorg.apache.lucene.store.FSDirectory;
- importorg.apache.lucene.store.LockObtainFailedException;
- importorg.apache.lucene.util.Version;
- importorg.wltea.analyzer.lucene.IKAnalyzer;
- publicclassSearcherUtil{
- privateDirectorydirectory;
- privateAnalyzeranalyzer=newIKAnalyzer();
- privateIndexReaderreader;
- privateString[]ids={"1","2","3","4","5","6"};
- privateString[]emails={"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
- privateString[]contents={
- "welcometovisitedthespace,Ilikebook",
- "helloboy,Ilikepingpengball",
- "mynameisccIlikegame",
- "Ilikefootball",
- "IlikefootballandIlikebasketballtoo",
- "Ilikemovieandswim"
- };
- privateDate[]dates=null;
- privateint[]attachs={2,3,1,4,5,5};
- privateString[]names={"zhangsan","lisi","john","jetty","mike","jake"};
- privateMap<String,Float>scores=newHashMap<String,Float>();
- publicSearcherUtil(){
- //directory=newRAMDirectory();
- try{
- directory=FSDirectory.open(newFile("F:\\Workspaces\\lucenes\\02_lucene_searcher\\index"));
- setDates();
- scores.put("itat.org",2.0f);
- scores.put("zttc.edu",1.5f);
- //index();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- privatevoidsetDates(){
- SimpleDateFormatsdf=newSimpleDateFormat("yyyy-MM-dd");
- try{
- dates=newDate[ids.length];
- dates[0]=sdf.parse("2010-02-19");
- dates[1]=sdf.parse("2012-01-11");
- dates[2]=sdf.parse("2011-09-19");
- dates[3]=sdf.parse("2010-12-22");
- dates[4]=sdf.parse("2012-01-01");
- dates[5]=sdf.parse("2011-05-19");
- }catch(ParseExceptione){
- e.printStackTrace();
- }
- }
- publicvoidindex(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- writer.deleteAll();
- Documentdoc=null;
- for(inti=0;i<ids.length;i++){
- doc=newDocument();
- doc.add(newField("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(newField("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(newField("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(newField("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- //存储数字
- doc.add(newNumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));
- //存储日期
- doc.add(newNumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
- Stringet=emails[i].substring(emails[i].lastIndexOf("@")+1);
- if(scores.containsKey(et)){
- doc.setBoost(scores.get(et));
- }else{
- doc.setBoost(0.5f);
- }
- writer.addDocument(doc);
- }
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicIndexSearchergetSearcher(){
- try{
- if(reader==null){
- reader=IndexReader.open(directory);
- }else{
- IndexReadertr=IndexReader.openIfChanged(reader);
- if(tr!=null){
- reader.close();
- reader=tr;
- }
- }
- returnnewIndexSearcher(reader);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- returnnull;
- }
- publicIndexSearchergetSearcher(Directorydirectory){
- try{
- if(reader==null){
- reader=IndexReader.open(directory);
- }else{
- IndexReadertr=IndexReader.openIfChanged(reader);
- if(tr!=null){
- reader.close();
- reader=tr;
- }
- }
- returnnewIndexSearcher(reader);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- returnnull;
- }
- publicvoidsearchByTerm(Stringfield,Stringname,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- Queryquery=newTermQuery(newTerm(field,name));
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByTermToken(Stringfield,Stringname,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- //Queryquery=newTermQuery(newTerm(field,name));
- //当用户输入两个关键字时,QueryParser默认它们之间的关系为“或”关系
- //下面这么写的话在对用户输入进行扫描时,就会用空格分开的关键字理解为“与”,
- //其实也就是构建了一个“与”关系的布尔型查询
- //parser.setDefaultOperator(Operator.AND);
- QueryParserparser=newQueryParser(Version.LUCENE_35,field,analyzer);
- Stringk=analyzerKey(name);
- Queryquery=parser.parse(name);
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(Exceptione){
- e.printStackTrace();
- }
- }
- privateStringanalyzerKey(Stringkey){
- //StandardAnalyzeranalyzer=newStandardAnalyzer(Version.LUCENE_35);
- StringReaderreader=newStringReader(key);
- TokenStreamtokenStream=analyzer.tokenStream("",reader);
- CharTermAttributetermattr=tokenStream.addAttribute(CharTermAttribute.class);
- StringBuildersb=newStringBuilder();
- try{
- while(tokenStream.incrementToken()){
- Stringk=termattr.toString();
- sb.append(k).append("");
- }
- }catch(IOExceptione){
- e.printStackTrace();
- }
- key=sb.toString().trim();
- key=key.replaceAll("\\s+","AND");
- returnsb.toString();
- }
- publicvoidprintDocument(IndexSearchersearcher,TopDocstds){
- System.out.println("共查询了【"+tds.totalHits+"】条");
- for(ScoreDocsd:tds.scoreDocs){
- try{
- Documentdoc=searcher.doc(sd.doc);
- System.out.println("filename:"+doc.get("filename"));
- System.out.println("path:"+doc.get("path"));
- System.out.println("date:"+doc.get("date"));
- System.out.println("size:"+doc.get("size"));
- System.out.println("content:"+doc.get("content"));
- System.out.println("-------------------------------------------");
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidsearchByTermRange(Stringfield,Stringstart,Stringend,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- Queryquery=newTermRangeQuery(field,start,end,true,true);
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *建立索引时:使用的Field,而使用NumericRangeQuery,必须使用NumericField
- *@paramfield
- *@paramstart
- *@paramend
- *@paramnum
- */
- publicvoidsearchByNumricRange(Stringfield,intstart,intend,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- Queryquery=NumericRangeQuery.newIntRange(field,start,end,true,true);
- //DateTools.dateToString(newDate(),null);
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByPrefix(Stringfield,Stringvalue,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- Queryquery=newPrefixQuery(newTerm(field,value));
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByWildcard(Stringfield,Stringvalue,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- //在传入的value中可以使用通配符:?和*,?表示匹配一个字符,*表示匹配任意多个字符
- Queryquery=newWildcardQuery(newTerm(field,value));
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByBoolean(intnum){
- try{
- IndexSearchersearcher=getSearcher();
- BooleanQueryquery=newBooleanQuery();
- /*
- *BooleanQuery可以连接多个子查询
- *Occur.MUST表示必须出现
- *Occur.SHOULD表示可以出现
- *Occur.MUSE_NOT表示不能出现
- */
- query.add(newTermQuery(newTerm("name","3")),Occur.MUST_NOT);
- query.add(newTermQuery(newTerm("content","健壮")),Occur.SHOULD);
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByPhrase(intnum){
- try{
- IndexSearchersearcher=getSearcher();
- PhraseQueryquery=newPhraseQuery();
- query.setSlop(10);
- query.add(newTerm("content","java"));
- //第一个Term
- query.add(newTerm("content","程序"));
- //产生距离之后的第二个Term
- //query.add(newTerm("content","football"));
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *查询用于匹配与指定项相似的项
- *默认是匹配一个有不同的,其他一样的,比如like和mike,就是距离算法的相似距离为1
- *这种方式少用,影响效率
- */
- publicvoidsearchByFuzzy(intnum){
- try{
- IndexSearchersearcher=getSearcher();
- //最后两个参数为匹配率和距离
- FuzzyQueryquery=newFuzzyQuery(newTerm("content","总统"),0.4f,0);
- System.out.println(query.getPrefixLength());
- System.out.println(query.getMinSimilarity());
- TopDocstds=searcher.search(query,num);
- printDocument(searcher,tds);
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchByQueryParse(Queryquery,intnum){
- try{
- IndexSearchersearcher=getSearcher();
- TopDocstds=searcher.search(query,num);
- System.out.println("一共查询了:"+tds.totalHits);
- for(ScoreDocsd:tds.scoreDocs){
- Documentdoc=searcher.doc(sd.doc);
- System.out.println(doc.get("id")+"---->"+
- doc.get("name")+"["+doc.get("email")+"]-->"+doc.get("id")+","+
- doc.get("attach")+","+doc.get("date")+"=="+sd.score);
- }
- searcher.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *lucene3.5之前采用的是一种再查询的方式,也就是说先把全部的结果的docid查询出来,然后
- *分页得到该页的docid,然后根据docid得到document信息,
- *lucene官方是说他的速度已经够快,再查询不会有效率问题
- *@paramquery
- *@parampageIndex
- *@parampageSize
- */
- publicvoidsearchPage(Stringquery,intpageIndex,intpageSize){
- try{
- Directorydir=FileIndexUtils.getDirectory();
- IndexSearchersearcher=getSearcher(dir);
- QueryParserparser=newQueryParser(Version.LUCENE_35,"content",analyzer);
- Queryq=parser.parse(query);
- TopDocstds=searcher.search(q,500);
- ScoreDoc[]sds=tds.scoreDocs;
- intstart=(pageIndex-1)*pageSize;
- intend=pageIndex*pageSize;
- for(inti=start;i<end;i++){
- Documentdoc=searcher.doc(sds[i].doc);
- System.out.println("filename:"+doc.get("filename"));
- System.out.println("path:"+doc.get("path"));
- System.out.println("date:"+doc.get("date"));
- System.out.println("size:"+doc.get("size"));
- System.out.println("content:"+doc.get("content"));
- System.out.println("-------------------------------------------");
- }
- searcher.close();
- }catch(org.apache.lucene.queryParser.ParseExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- /**
- *目前没有办法只取当前这页的数据,而是要全部查询然后得到docid
- *一种增加效率的方式是取的条数做下限制,比如不要每次都取500条,
- *也是把取的条数设置为当前页的所在位置数,比如每页10条,
- *取第一页数据则取10条,取第二页则取20条,取五页则去50条
- *根据页码和分页大小获取上一次的最后一个ScoreDoc
- */
- privateScoreDocgetLastScoreDoc(intpageIndex,intpageSize,Queryquery,IndexSearchersearcher)throwsIOException{
- if(pageIndex==1)returnnull;//如果是第一页就返回空
- intnum=pageSize*(pageIndex-1);//获取上一页的数量
- TopDocstds=searcher.search(query,num);
- returntds.scoreDocs[num-1];
- }
- /**
- *使用这种方式的话是把上一页的最后一个元素给拿到,然后再把pagesize传入,
- *就可以得到当页的数据,其实就是简便了查询,原理还是把全部的docid查询后在得到document
- *@paramquery
- *@parampageIndex
- *@parampageSize
- */
- publicvoidsearchPageByAfter(Stringquery,intpageIndex,intpageSize){
- try{
- Directorydir=FileIndexUtils.getDirectory();
- IndexSearchersearcher=getSearcher(dir);
- QueryParserparser=newQueryParser(Version.LUCENE_35,"content",analyzer);
- Queryq=parser.parse(query);
- //先获取上一页的最后一个元素
- ScoreDoclastSd=getLastScoreDoc(pageIndex,pageSize,q,searcher);
- //通过最后一个元素搜索下页的pageSize个元素
- TopDocstds=searcher.searchAfter(lastSd,q,pageSize);
- printDocument(searcher,tds);
- searcher.close();
- }catch(org.apache.lucene.queryParser.ParseExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidsearchNoPage(Stringquery){
- try{
- Directorydir=FileIndexUtils.getDirectory();
- IndexSearchersearcher=getSearcher(dir);
- QueryParserparser=newQueryParser(Version.LUCENE_35,"content",newStandardAnalyzer(Version.LUCENE_35));
- Queryq=parser.parse(query);
- TopDocstds=searcher.search(q,20);
- ScoreDoc[]sds=tds.scoreDocs;
- for(inti=0;i<sds.length;i++){
- Documentdoc=searcher.doc(sds[i].doc);
- System.out.println(sds[i].doc+":"+doc.get("path")+"-->"+doc.get("filename"));
- }
- searcher.close();
- }catch(org.apache.lucene.queryParser.ParseExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
3、查询语法的测试单元类
[java]
view plaincopy
- packageorg.itat.test;
- importjava.io.File;
- importjava.io.IOException;
- importorg.apache.commons.io.FileUtils;
- importorg.apache.commons.io.FilenameUtils;
- importorg.apache.lucene.analysis.Analyzer;
- importorg.apache.lucene.analysis.standard.StandardAnalyzer;
- importorg.apache.lucene.queryParser.ParseException;
- importorg.apache.lucene.queryParser.QueryParser;
- importorg.apache.lucene.search.Query;
- importorg.apache.lucene.util.Version;
- importorg.itat.index.FileIndexUtils;
- importorg.itat.index.SearcherUtil;
- importorg.junit.Before;
- importorg.junit.Test;
- importorg.wltea.analyzer.lucene.IKAnalyzer;
- publicclassTestSearch{
- privateSearcherUtilsu;
- privateAnalyzeranalyzer=newIKAnalyzer();
- @Before
- publicvoidinit(){
- su=newSearcherUtil();
- }
- @Test
- publicvoidtestCopyFiles(){
- try{
- Filefile=newFile("F:\\Workspaces\\lucenes\\02_lucene_searcher\\resource");
- for(Filef:file.listFiles()){
- StringdestFileName=FilenameUtils.getFullPath(f.getAbsolutePath())+
- FilenameUtils.getBaseName(f.getName())+".she";
- FileUtils.copyFile(f,newFile(destFileName));
- }
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- @Test
- publicvoidsearchByTerm(){
- //su.searchByTerm("content","",10);
- su.searchByTermToken("content","头脑风暴",10);
- }
- @Test
- publicvoidsearchByTermRange(){
- //查询name以a开头和s结尾的
- //su.searchByTermRange("name","a","s",10);
- //由于attachs是数字类型,使用TermRange无法查询
- //su.searchByTermRange("size",newNumericField("200").stringValue(),newNumericField("500").stringValue(),10);
- QueryParserparser=newQueryParser(Version.LUCENE_35,"size",analyzer);
- Queryquery;
- try{
- query=parser.parse("size:[100TO500]");
- su.searchByQueryParse(query,10);
- }catch(ParseExceptione){
- e.printStackTrace();
- }
- }
- @Test
- publicvoidsearchByNumRange(){
- //su.searchByNumricRange("attach",2,10,5);
- su.searchByNumricRange("size",100,300,10);
- }
- @Test
- publicvoidsearchByPrefix(){
- su.searchByPrefix("content","人",10);
- }
- @Test
- publicvoidsearchByWildcard(){
- //匹配@itat.org结尾的所有字符
- //su.searchByWildcard("email","*@itat.org",10);
- //匹配j开头的有三个字符的name
- //su.searchByWildcard("name","j???",10);
- su.searchByWildcard("content","类?",10);
- }
- @Test
- publicvoidsearchByBoolean(){
- su.searchByBoolean(10);
- }
- @Test
- publicvoidsearchByPhrase(){
- su.searchByPhrase(10);
- }
- @Test
- publicvoidsearchByFuzzy(){
- su.searchByFuzzy(10);
- }
- @Test
- publicvoidsearchByQueryParse()throwsParseException{
- //1、创建QueryParser对象,默认搜索域为content
- QueryParserparser=newQueryParser(Version.LUCENE_35,"content",newStandardAnalyzer(Version.LUCENE_35));
- //改变空格的默认操作符,以下可以改成AND
- //parser.setDefaultOperator(Operator.AND);
- //开启第一个字符的通配符匹配,默认关闭因为效率不高
- parser.setAllowLeadingWildcard(true);
- //搜索content中包含有like的
- Queryquery=parser.parse("like");
- //有basketball或者football的,空格默认就是OR
- query=parser.parse("basketballfootball");
- //改变搜索域为name为mike
- //query=parser.parse("content:like");
- //同样可以使用*和?来进行通配符匹配
- //query=parser.parse("name:j*");
- //通配符默认不能放在首位
- //query=parser.parse("email:*@itat.org");
- //匹配name中没有mike但是content中必须有football的,+和-要放置到域说明前面
- query=parser.parse("-name:mike+like");
- //匹配一个区间,注意:TO必须是大写
- //query=parser.parse("id:[1TO6]");
- //闭区间匹配只会匹配到2
- //query=parser.parse("id:{1TO3}");
- //完全匹配ILikeFootball的
- //query=parser.parse("\"Ilikefootball\"");
- //匹配I和football之间有一个单词距离的
- //query=parser.parse("\"Ifootball\"~1");
- //模糊查询
- //query=parser.parse("name:make~");
- //没有办法匹配数字范围(自己扩展Parser)
- //query=parser.parse("attach:[2TO10]");
- su.searchByQueryParse(query,10);
- }
- @Test
- publicvoidindexFile(){
- FileIndexUtils.index(true);
- }
- @Test
- publicvoidtestSearchPage01(){
- su.searchPage("java",2,5);
- System.out.println("-------------------------------");
- //su.searchNoPage("java");
- su.searchPageByAfter("java",2,2);
- }
- @Test
- publicvoidtestSearchPage02(){
- su.searchPageByAfter("java",3,20);
- }
- }
4、创建索引的类
[java]
view plaincopy
- packageorg.itat.index;
- importjava.io.File;
- importjava.io.FileReader;
- importjava.io.IOException;
- importorg.apache.commons.io.FileUtils;
- importorg.apache.lucene.analysis.Analyzer;
- importorg.apache.lucene.analysis.standard.StandardAnalyzer;
- importorg.apache.lucene.document.Document;
- importorg.apache.lucene.document.Field;
- importorg.apache.lucene.document.NumericField;
- importorg.apache.lucene.index.CorruptIndexException;
- importorg.apache.lucene.index.IndexWriter;
- importorg.apache.lucene.index.IndexWriterConfig;
- importorg.apache.lucene.store.Directory;
- importorg.apache.lucene.store.FSDirectory;
- importorg.apache.lucene.store.LockObtainFailedException;
- importorg.apache.lucene.util.Version;
- importorg.wltea.analyzer.lucene.IKAnalyzer;
- publicclassFileIndexUtils{
- privatestaticDirectorydirectory=null;
- privatestaticAnalyzeranalyzer=newIKAnalyzer();
- static{
- try{
- directory=FSDirectory.open(newFile("F:\\Workspaces\\lucenes\\02_lucene_searcher\\index"));
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicstaticDirectorygetDirectory(){
- returndirectory;
- }
- publicstaticvoidindex(booleanhasNew){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,newIndexWriterConfig(Version.LUCENE_35,analyzer));
- if(hasNew){
- writer.deleteAll();
- }
- Filefile=newFile("F:\\Workspaces\\lucenes\\02_lucene_searcher\\resource");
- Documentdoc=null;
- for(Filef:file.listFiles()){
- doc=newDocument();
- doc.add(newField("content",FileUtils.readFileToString(f),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(newField("filename",f.getName(),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(newField("path",f.getAbsolutePath(),Field.Store.YES,Field.Index.ANALYZED));
- doc.add(newNumericField("date",Field.Store.YES,true).setLongValue(f.lastModified()));
- doc.add(newNumericField("size",Field.Store.YES,true).setIntValue((int)(f.length())));
- writer.addDocument(doc);
- }
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- }
5、对索引进行操作的类
[java]
view plaincopy
- packageorg.itat.index;
- importjava.io.IOException;
- importjava.text.ParseException;
- importjava.text.SimpleDateFormat;
- importjava.util.Date;
- importjava.util.HashMap;
- importjava.util.Map;
- importorg.apache.lucene.analysis.standard.StandardAnalyzer;
- importorg.apache.lucene.document.Document;
- importorg.apache.lucene.document.Field;
- importorg.apache.lucene.document.NumericField;
- importorg.apache.lucene.index.CorruptIndexException;
- importorg.apache.lucene.index.IndexReader;
- importorg.apache.lucene.index.IndexWriter;
- importorg.apache.lucene.index.IndexWriterConfig;
- importorg.apache.lucene.index.StaleReaderException;
- importorg.apache.lucene.index.Term;
- importorg.apache.lucene.store.Directory;
- importorg.apache.lucene.store.LockObtainFailedException;
- importorg.apache.lucene.store.RAMDirectory;
- importorg.apache.lucene.util.Version;
- publicclassIndexUtil{
- privateString[]ids={"1","2","3","4","5","6"};
- privateString[]emails={"aa@itat.org","bb@itat.org","cc@cc.org","dd@sina.org","ee@zttc.edu","ff@itat.org"};
- privateString[]contents={
- "welcometovisitedthespace,Ilikebook",
- "helloboy,Ilikepingpengball",
- "mynameisccIlikegame",
- "Ilikefootball",
- "IlikefootballandIlikebasketballtoo",
- "Ilikemovieandswim"
- };
- privateDate[]dates=null;
- privateint[]attachs={2,3,1,4,5,5};
- privateString[]names={"zhangsan","lisi","john","jetty","mike","jake"};
- privateDirectorydirectory=null;
- privateMap<String,Float>scores=newHashMap<String,Float>();
- publicIndexUtil(){
- setDates();
- scores.put("itat.org",2.0f);
- scores.put("zttc.edu",1.5f);
- directory=newRAMDirectory();
- index();
- }
- privatevoidsetDates(){
- SimpleDateFormatsdf=newSimpleDateFormat("yyyy-MM-dd");
- try{
- dates=newDate[ids.length];
- dates[0]=sdf.parse("2010-02-19");
- dates[1]=sdf.parse("2012-01-11");
- dates[2]=sdf.parse("2011-09-19");
- dates[3]=sdf.parse("2010-12-22");
- dates[4]=sdf.parse("2012-01-01");
- dates[5]=sdf.parse("2011-05-19");
- }catch(ParseExceptione){
- e.printStackTrace();
- }
- }
- publicvoidundelete(){
- //使用IndexReader进行恢复
- try{
- IndexReaderreader=IndexReader.open(directory,false);
- //恢复时,必须把IndexReader的只读(readOnly)设置为false
- reader.undeleteAll();
- reader.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(StaleReaderExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidmerge(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- //会将索引合并为2段,这两段中的被删除的数据会被清空
- //特别注意:此处Lucene在3.5之后不建议使用,因为会消耗大量的开销,
- //Lucene会根据情况自动处理的
- writer.forceMerge(2);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidforceDelete(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- writer.forceMergeDeletes();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoiddelete(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- //参数是一个选项,可以是一个Query,也可以是一个term,term是一个精确查找的值
- //此时删除的文档并不会被完全删除,而是存储在一个回收站中的,可以恢复
- writer.deleteDocuments(newTerm("id","1"));
- writer.commit();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidupdate(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,
- newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- /*
- *Lucene并没有提供更新,这里的更新操作其实是如下两个操作的合集
- *先删除之后再添加
- */
- Documentdoc=newDocument();
- doc.add(newField("id","11",Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(newField("email",emails[0],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(newField("content",contents[0],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(newField("name",names[0],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- writer.updateDocument(newTerm("id","1"),doc);
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- publicvoidquery(){
- try{
- IndexReaderreader=IndexReader.open(directory);
- //通过reader可以有效的获取到文档的数量
- System.out.println("numDocs:"+reader.numDocs());
- System.out.println("maxDocs:"+reader.maxDoc());
- System.out.println("deleteDocs:"+reader.numDeletedDocs());
- reader.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- publicvoidindex(){
- IndexWriterwriter=null;
- try{
- writer=newIndexWriter(directory,newIndexWriterConfig(Version.LUCENE_35,newStandardAnalyzer(Version.LUCENE_35)));
- writer.deleteAll();
- Documentdoc=null;
- for(inti=0;i<ids.length;i++){
- doc=newDocument();
- doc.add(newField("id",ids[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- doc.add(newField("email",emails[i],Field.Store.YES,Field.Index.NOT_ANALYZED));
- doc.add(newField("content",contents[i],Field.Store.NO,Field.Index.ANALYZED));
- doc.add(newField("name",names[i],Field.Store.YES,Field.Index.NOT_ANALYZED_NO_NORMS));
- //存储数字
- doc.add(newNumericField("attach",Field.Store.YES,true).setIntValue(attachs[i]));
- //存储日期
- doc.add(newNumericField("date",Field.Store.YES,true).setLongValue(dates[i].getTime()));
- Stringet=emails[i].substring(emails[i].lastIndexOf("@")+1);
- System.out.println(et);
- if(scores.containsKey(et)){
- doc.setBoost(scores.get(et));
- }else{
- doc.setBoost(0.5f);
- }
- writer.addDocument(doc);
- }
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(LockObtainFailedExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }finally{
- try{
- if(writer!=null)writer.close();
- }catch(CorruptIndexExceptione){
- e.printStackTrace();
- }catch(IOExceptione){
- e.printStackTrace();
- }
- }
- }
- }
工程下载地址:http://download.csdn.net/detail/wxwzy738/5256553
相关推荐
Lucene各版本间变化较大,lucene官方的tutorial里面很多还是lucene-3.x.x的版本,这是Lucene实战(中文版第二版)对应Lucene版本,有需要的拿去用。
lucene in action 第二版(pdf),内为英文彩色原版。
第二个是 RAMDirectory,它表示一个存储在内存当中的索引的位置。 public void add(Query query, BooleanClause.Occur occur) BooleanClause用于表示布尔查询子句关系的类,包括: BooleanClause.Occur.MUST,...
Lucene实战(第二版)源代码
《Lucene实战(第2版)》基于Apache的Lucene 3.0,从Lucene核心、Lucene应用、案例分析3个方面详细系统地介绍了Lucene,包括认识Lucene、建立索引、为应用程序添加搜索功能、高级搜索技术、扩展搜索、使用Tika提取...
《Lucene实战 第2版 》基于Apache的Lucene 3 0 从Lucene核心 Lucene应用 案例分析3个方面详细系统地介绍了Lucene 包括认识Lucene 建立索引 为应用程序添加搜索功能 高级搜索技术 扩展搜索 使用Tika提取文本 Lucene...
《Lucene实战(第2版)》基于apache的Lucene3.0,从Lucene核心、Lucene应用、案例分析3个方面详细系统地介绍了Lucene,包括认识Lucene、建立索引、为应用程序添加搜索功能、高级搜索技术、扩展搜索、使用tika提取...
《Lucene实战(第2版)》基于Apache的Lucene 3.0,从Lucene核心、Lucene应用、案例分析3个方面详细系统地介绍了Lucene,包括认识Lucene、建立索引、为应用程序添加搜索功能、高级搜索技术、扩展搜索、使用Tika提取文本...
第2部分lucene的应用 通过对lucene内置工具的介绍 展示了lucene技术的高级应用和在各种程序语言上的移植 本书既可作为学习材料 又可以作为参考手册 它适合于已经熟悉基本java编程的读者 以及希望能够把强大的搜索...
《Lucene实战(第2版)》基于apache的lucene3.0,从lucene核心、lucene应用、案例分析3个方面详细系统地介绍了lucene,包括认识lucene、建立索引、为应用程序添加搜索功能、高级搜索技术、扩展搜索、使用tika提取文本、...
第2部分Lucene的应用,通过对Lucene内置工具的介绍,展示了Lucene技术的高级应用和在各种程序语言上的移植。. 本书既可作为学习材料,又可以作为参考手册。它适合于已经熟悉基本Java编程的读者,以及希望能够把强大...
lecene4.x教程,有很多简单明了的demo,是学习开发的好助手
lucene in action 第二版中文版,
主要讲解lucene的使用,如果要实现全文(文本)检索,这将是一本很强大很实用的教程,不可多得
解密搜索引擎技术实战Lucene&Java精华版(2)-补第5章p1 解密搜索引擎技术实战Lucene&Java精华版(3)-补第5章p2 解密搜索引擎技术实战Lucene&Java精华版(4)-补第6章 解密搜索引擎技术实战Lucene&Java精华版(5)-libp1 ...
Lucene实战(第2版)_中文版
Lucene实战(第2版).pdf
为使用最好的java开源搜索引擎提供详细的细节描述
Lucene In Action第二版的中文及英文PDF,均带有书签。