Lemmatizer
词形还原(Lemmatization):根据单词及其词性标注(POS tag),把单词还原为它在词典中的原形(lemma)。如
输入:
Rockwell_NNP International_NNP Corp._NNP 's_POS Tulsa_NNP unit_NN said_VBD it_PRP
或者,输入三列(训练数据的格式),每行一个词:第一列为原词,第二列为词性标注,第三列为该词的lemma(原形)
He PRP he
reckons VBZ reckon
the DT the
current JJ current
accounts NNS account
deficit NN deficit
will MD will
narrow VB narrow
to TO to
only RB only
# # #
1.8 CD 1.8
millions CD million
in IN in
September NNP september
. . O
输出:
Rockwell NNP rockwell
International NNP international
Corp. NNP corp.
's POS 's
Tulsa NNP tulsa
unit NN unit
said VBD say
it PRP it
模型训练
```java
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import opennlp.tools.lemmatizer.LemmaSample;
import opennlp.tools.lemmatizer.LemmaSampleStream;
import opennlp.tools.lemmatizer.LemmatizerEvaluator;
import opennlp.tools.lemmatizer.LemmatizerFactory;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
/**
 * Trains an OpenNLP lemmatizer model from a local training file, saves the model to disk,
 * and prints the word accuracy measured on that same training file.
 *
 * <p>All paths are resolved against the working directory ({@code user.dir}): the training
 * data is read from {@code resources/lemmatizer.txt} and the model is written to
 * {@code opennlpmodel/lemmatizer-my.bin}.
 */
public class LemmatizerTrain {

    /**
     * Runs training, model serialization and evaluation in sequence.
     *
     * @param args unused
     * @throws IOException if the training data cannot be read or the model cannot be written
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String fileResourcesDir = rootDir + "resources" + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        // Path of the training data.
        String filePath = fileResourcesDir + "lemmatizer.txt";
        // Path where the trained model is saved.
        String modelPath = modelResourcesDir + "lemmatizer-my.bin";

        InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File(filePath));

        // Read the file line by line and turn the lines into LemmaSample objects.
        // Closing the sample stream also closes the wrapped line stream.
        try (ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(
                new PlainTextByLineStream(inputStreamFactory, StandardCharsets.UTF_8))) {

            // Train the model.
            LemmatizerFactory factory = new LemmatizerFactory();
            LemmatizerModel model =
                    LemmatizerME.train("en", sampleStream, TrainingParameters.defaultParams(), factory);

            // Save the model. Create the target directory first so that a missing folder
            // does not make the save fail after training has already finished.
            File modelFile = new File(modelPath);
            File modelDir = modelFile.getParentFile();
            if (modelDir != null) {
                modelDir.mkdirs();
            }
            try (OutputStream modelOut = new BufferedOutputStream(new FileOutputStream(modelFile))) {
                model.serialize(modelOut);
            }

            // Training has consumed the sample stream; rewind it, otherwise the evaluator
            // would see no samples at all.
            sampleStream.reset();

            // Evaluate the model. NOTE: this reuses the training data, so the reported
            // accuracy is optimistic; use held-out data for a realistic estimate.
            LemmatizerEvaluator evaluator = new LemmatizerEvaluator(new LemmatizerME(model));
            evaluator.evaluate(sampleStream);
            // getWordAccuracy() is a ratio, not a count, so label it as accuracy.
            System.out.println("词准确率:" + evaluator.getWordAccuracy());
        }
    }
}
```
使用训练好的模型进行词形还原
```java
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;
/**
 * Loads the lemmatizer model written by the training example and prints the lemma of
 * every token of a hard-coded, POS-tagged English sentence, one lemma per line.
 *
 * <p>The model is read from {@code opennlpmodel/lemmatizer-my.bin} under the working
 * directory ({@code user.dir}).
 */
public class LemmatizerPredit {

    /**
     * Loads the model, lemmatizes the sample sentence and prints the result to stdout.
     *
     * @param args unused
     * @throws IOException if the model file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String rootDir = System.getProperty("user.dir") + File.separator;
        String modelResourcesDir = rootDir + "opennlpmodel" + File.separator;
        String modelPath = modelResourcesDir + "lemmatizer-my.bin";

        // Load the model; the input stream is closed as soon as the model has been read.
        LemmatizerModel model;
        try (InputStream modelIn = new FileInputStream(modelPath)) {
            model = new LemmatizerModel(modelIn);
        }

        // Instantiate the lemmatizer.
        LemmatizerME lemmatizer = new LemmatizerME(model);

        // Tokens and their POS tags; both arrays are index-aligned (one tag per token).
        String[] tokens = new String[] { "Rockwell", "International", "Corp.", "'s",
                "Tulsa", "unit", "said", "it", "signed", "a", "tentative", "agreement",
                "extending", "its", "contract", "with", "Boeing", "Co.", "to",
                "provide", "structural", "parts", "for", "Boeing", "'s", "747",
                "jetliners", "." };
        String[] postags = new String[] { "NNP", "NNP", "NNP", "POS", "NNP", "NN",
                "VBD", "PRP", "VBD", "DT", "JJ", "NN", "VBG", "PRP$", "NN", "IN",
                "NNP", "NNP", "TO", "VB", "JJ", "NNS", "IN", "NNP", "POS", "CD", "NNS",
                "." };

        // Lemmatize and print one lemma per line.
        String[] lemmas = lemmatizer.lemmatize(tokens, postags);
        for (String lemma : lemmas) {
            System.out.println(lemma);
        }
    }
}