Bootstrap

[NLP]OpenNLP词形还原器(Lemmatizer)的使用

Lemmatizer

词形还原(Lemmatization):根据单词及其词性标注(POS tag),把单词还原为它的词典原形(lemma),例如 said 还原为 say。如
输入:
Rockwell_NNP International_NNP Corp._NNP 's_POS Tulsa_NNP unit_NN said_VBD it_PRP
训练数据则采用三列格式(每行一个词):第一列为原词,第二列为词性标注,第三列为该词的原形(lemma)

He        PRP  he
reckons   VBZ  reckon
the       DT   the
current   JJ   current
accounts  NNS  account
deficit   NN   deficit
will      MD   will
narrow    VB   narrow
to        TO   to
only      RB   only
#         #    #
1.8       CD   1.8
millions  CD   million
in        IN   in
September NNP  september
.         .    O

输出:
Rockwell NNP rockwell
International NNP international
Corp. NNP corp.
's POS 's
Tulsa NNP tulsa
unit NN unit
said VBD say
it PRP it

模型训练


```java
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.lemmatizer.LemmaSample;
import opennlp.tools.lemmatizer.LemmaSampleStream;
import opennlp.tools.lemmatizer.LemmatizerEvaluator;
import opennlp.tools.lemmatizer.LemmatizerFactory;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

/**
 * Trains an OpenNLP lemmatizer model from a training file, saves the model to
 * disk, and then prints the word accuracy of the model measured on the same
 * training data.
 *
 * <p>Paths are resolved relative to the working directory ({@code user.dir}):
 * the training data is read from {@code resources/lemmatizer.txt} and the model
 * is written to {@code opennlpmodel/lemmatizer-my.bin}.
 */
public class LemmatizerTrain {

	/**
	 * Runs training, model serialization and evaluation.
	 *
	 * @param args unused
	 * @throws IOException if the training data cannot be read, the model
	 *                     directory cannot be created, or the model cannot be written
	 */
	public static void main(String[] args) throws IOException {
		String rootDir = System.getProperty("user.dir") + File.separator;

		String fileResourcesDir = rootDir + "resources" + File.separator;
		String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;

		// Path of the training data.
		String filePath = fileResourcesDir + "lemmatizer.txt";
		// Path where the trained model is saved.
		String modelPath = modelResourcesDir + "lemmatizer-my.bin";

		// Make sure the output directory exists before training starts, so a
		// missing directory is reported up front rather than after training.
		File modelDir = new File(modelPath).getParentFile();
		if (modelDir != null && !modelDir.isDirectory() && !modelDir.mkdirs()) {
			throw new IOException("Cannot create model directory: " + modelDir);
		}

		InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));

		// Both streams are closed automatically when the try block exits.
		try (ObjectStream<String> lineStream =
				new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8);
				// Turns the raw lines into lemma training samples.
				ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(lineStream)) {

			LemmatizerFactory factory = new LemmatizerFactory();

			// Train the model.
			LemmatizerModel model = LemmatizerME.train(
					"en", sampleStream, TrainingParameters.defaultParams(), factory);

			// Save the model; try-with-resources flushes and closes the file.
			try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelPath))) {
				model.serialize(modelOut);
			}

			// Evaluate the model. Training has already read the sample stream to
			// its end, so rewind it first; otherwise the evaluator sees no samples.
			sampleStream.reset();
			LemmatizerEvaluator evaluator = new LemmatizerEvaluator(new LemmatizerME(model));
			evaluator.evaluate(sampleStream);
			// NOTE(review): the printed label reads "number of correct words", but
			// the value comes from getWordAccuracy() - confirm the intended wording.
			System.out.println("正确的词数:" + evaluator.getWordAccuracy());
		}
	}
}

```

词形还原


```java
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;

/**
 * Loads a previously trained OpenNLP lemmatizer model from
 * {@code opennlpmodel/lemmatizer-my.bin} (relative to the working directory)
 * and prints one lemmatizer output per line for a fixed, POS-tagged example
 * sentence.
 */
public class LemmatizerPredit {

	/**
	 * Loads the model, lemmatizes the example sentence and prints the result.
	 *
	 * @param args unused
	 * @throws IOException if the model file cannot be opened or read
	 */
	public static void main(String[] args) throws IOException {
		String rootDir = System.getProperty("user.dir") + File.separator;
		String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
		String modelPath = modelResourcesDir + "lemmatizer-my.bin";

		// Load the model; the input stream is closed as soon as loading finishes.
		LemmatizerModel model;
		try (InputStream modelIn = new FileInputStream(modelPath)) {
			model = new LemmatizerModel(modelIn);
		}

		// Create the lemmatizer from the loaded model.
		LemmatizerME lemmatizer = new LemmatizerME(model);

		// Example sentence; tokens[i] is tagged with postags[i].
		String[] tokens = new String[] { "Rockwell", "International", "Corp.", "'s",
				"Tulsa", "unit", "said", "it", "signed", "a", "tentative", "agreement",
				"extending", "its", "contract", "with", "Boeing", "Co.", "to",
				"provide", "structural", "parts", "for", "Boeing", "'s", "747",
				"jetliners", "." };

		String[] postags = new String[] { "NNP", "NNP", "NNP", "POS", "NNP", "NN",
				"VBD", "PRP", "VBD", "DT", "JJ", "NN", "VBG", "PRP$", "NN", "IN",
				"NNP", "NNP", "TO", "VB", "JJ", "NNS", "IN", "NNP", "POS", "CD", "NNS",
				"." };

		// Lemmatize the sentence and print one result per line.
		String[] lemmas = lemmatizer.lemmatize(tokens, postags);
		for (String lemma : lemmas) {
			System.out.println(lemma);
		}
	}
}

```

悦读

道可道,非常道;名可名,非常名。 无名,天地之始,有名,万物之母。 故常无欲,以观其妙,常有欲,以观其徼。 此两者,同出而异名,同谓之玄,玄之又玄,众妙之门。

;