编译Ansj之Solr插件

Tags: ansj solr lucene

Ansj是一个比较优秀的中文分词组件,具体情况就不在本文介绍了。ansj作者在其官方代码中,提供了对lucene接口的支持。如果用在Solr下,还需要简单的扩展一下。

1、基于maven管理

ansj是基于maven进行开发管理的。我们首先修改一下其pom.xml,具体如下所示:

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
	<modelVersion>4.0.0</modelVersion>
	<parent>
		<groupId>org.ansj</groupId>
		<artifactId>MavenAccount-aggregator</artifactId>
		<version>0.0.1</version>
		<relativePath>../pom.xml</relativePath>
	</parent>
	<artifactId>ansj_lucene4_plug</artifactId>
	<version>2.0.2</version>
	<packaging>jar</packaging>
	<name>ansj_lucene4_plug</name>
 	<properties>
	 <solr.version>4.8.0</solr.version>
    </properties>
	<dependencies>
		<dependency>
			<groupId>org.ansj</groupId>
			<artifactId>ansj_seg</artifactId>
			<version>2.0.5</version>
			<classifier>min</classifier>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>${solr.version}</version>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-highlighter</artifactId>
			<version>${solr.version}</version>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queries</artifactId>
			<version>${solr.version}</version>
			<scope>provided</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queryparser</artifactId>
			<version>${solr.version}</version>
			<scope>provided</scope>
		</dependency>
	   <dependency>
			<groupId>org.apache.solr</groupId>
			<artifactId>solr-dataimporthandler</artifactId>
			<version>${solr.version}</version>
			<scope>provided</scope>
	  </dependency>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>4.4</version>
			<scope>test</scope>
		</dependency>
	</dependencies>
</project>

其中,代码依赖的配置项:<scope>provided</scope> 表示只用于代码编译阶段。依赖关系整理好以后,写一个TokenizerFactory类,用于solr中配置使用,代码如下:

package org.ansj.solr;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.ansj.lucene.util.AnsjTokenizer;
import org.ansj.splitWord.analysis.IndexAnalysis;
import org.ansj.splitWord.analysis.ToAnalysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.util.AttributeSource.AttributeFactory;
public class AnsjTokenizerFactory extends TokenizerFactory{
	boolean pstemming;
	boolean isQuery;
	private String stopwordsDir;
	public Set<String> filter;  
	public AnsjTokenizerFactory(Map<String, String> args) {
		super(args);
		assureMatchVersion();
		isQuery = getBoolean(args, "isQuery", true);
		pstemming = getBoolean(args, "pstemming", false);
		stopwordsDir = get(args,"words");
		addStopwords(stopwordsDir);
	}
	//add stopwords list to filter
	private void addStopwords(String dir) {
		if (dir == null){
			System.out.println("no stopwords dir");
			return;
		}
		//read stoplist
		System.out.println("stopwords: " + dir);
		filter = new HashSet<String>();
		File file = new File(dir); 
		InputStreamReader reader;
		try {
			reader = new InputStreamReader(new FileInputStream(file),"UTF-8");
			BufferedReader br = new BufferedReader(reader); 
			String word = br.readLine();  
			while (word != null) {
				filter.add(word);
				word = br.readLine(); 
			}  
		} catch (FileNotFoundException e) {
			System.out.println("No stopword file found");
		} catch (IOException e) {
			System.out.println("stopword file io exception");
		}	  
	}
	@Override
	public Tokenizer create(AttributeFactory factory, Reader input) {
		if(isQuery == true){
			//query
			return new AnsjTokenizer(new ToAnalysis(new BufferedReader(input)), input, filter, pstemming);
		} else {
			//index
			return new AnsjTokenizer(new IndexAnalysis(new BufferedReader(input)), input, filter, pstemming);
		}
	}	   
}

pstemming 参数是ansj需要的参数。

isQuery 是用于判断是查询还是索引,一般搜索index阶段分词比较细,查询的分词比较粗。

2、编译jar包。

代码结构如下:

编写mavn编译命令:mvn install -DskipTests=true# 忽略单元测试编译。

执行编译:

[INFO] Scanning for projects...
[INFO]                                                                         
[INFO] ------------------------------------------------------------------------
[INFO] Building ansj_lucene4_plug 2.0.2
[INFO] ------------------------------------------------------------------------
[INFO] 
[INFO] --- maven-clean-plugin:2.4.1:clean (default-clean) @ ansj_lucene4_plug ---
[INFO] Deleting R:\ansj-seg\ansj_seg\plug\ansj_lucene4_plug\target
[INFO] 
[INFO] --- maven-resources-plugin:2.4.3:resources (default-resources) @ ansj_lucene4_plug ---
[INFO] Using 'UTF-8' encoding to copy filtered resources.
[INFO] skip non existing resourceDirectory R:\ansj-seg\ansj_seg\plug\ansj_lucene4_plug\src\main\resources
[INFO] 
[INFO] --- maven-compiler-plugin:2.3.2:compile (default-compile) @ ansj_lucene4_plug ---
[INFO] Compiling 5 source files to R:\ansj-seg\ansj_seg\plug\ansj_lucene4_plug\target\classes
[INFO] 
[INFO] --- maven-resources-plugin:2.4.3:testResources (default-testResources) @ ansj_lucene4_plug ---
[INFO] Using 'UTF-8' encoding to copy filtered resources.
[INFO] skip non existing resourceDirectory R:\ansj-seg\ansj_seg\plug\ansj_lucene4_plug\src\test\resources
[INFO] 
[INFO] --- maven-compiler-plugin:2.3.2:testCompile (default-testCompile) @ ansj_lucene4_plug ---
[INFO] Compiling 3 source files to R:\ansj-seg\ansj_seg\plug\ansj_lucene4_plug\target\test-classes
[INFO] 
[INFO] --- maven-surefire-plugin:2.7.1:test (default-test) @ ansj_lucene4_plug ---
[INFO] Tests are skipped.
[INFO] 
[INFO] --- maven-jar-plugin:2.3.1:jar (default-jar) @ ansj_lucene4_plug ---
[INFO] Building jar: R:\ansj-seg\ansj_seg\plug\ansj_lucene4_plug\target\ansj_lucene4_plug-2.0.2.jar
[INFO] 
[INFO] --- maven-install-plugin:2.3.1:install (default-install) @ ansj_lucene4_plug ---
[INFO] Installing R:\ansj-seg\ansj_seg\plug\ansj_lucene4_plug\target\ansj_lucene4_plug-2.0.2.jar to C:\Users\GCZX-016\.m2\repository\org\ansj\ansj_lucene4_plug\2.0.2\ansj_lucene4_plug-2.0.2.jar
[INFO] Installing R:\ansj-seg\ansj_seg\plug\ansj_lucene4_plug\pom.xml to C:\Users\GCZX-016\.m2\repository\org\ansj\ansj_lucene4_plug\2.0.2\ansj_lucene4_plug-2.0.2.pom
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 8.149s
[INFO] Finished at: Tue May 05 15:29:19 CST 2015
[INFO] Final Memory: 27M/245M
[INFO] ------------------------------------------------------------------------

本文链接:http://www.4byte.cn/learning/119802/bian-yi-ansj-zhi-solr-cha-jian.html