
Implementing a Tokenizer with Stanford Parser (Intelligent Language Processing)

2022-07-22 21:23:00 武念

Yesterday I spent some time studying Stanford Parser, with the idea of combining its intelligent word segmentation with a Lucene analyzer. The project schedule was tight, so part of the investigation is unfinished and the code still has bugs; readers interested in this approach are welcome to improve it.

Lucene version: 4.10.3. Jars to add: stanford-parser-3.3.0-models.jar and stanford-parser.jar.
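
Before wiring anything into Lucene, it helps to see the segmentation effect on its own. The following is a minimal standalone sketch (not from the original post): it loads the Chinese xinhuaFactoredSegmenting model used later and prints the leaves of the parse tree, which are the segmented words. The class name SegmentDemo is made up for illustration, and the lp.parse(String) call simply mirrors how the tokenizer code below uses the parser.

    package main.test;

    import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
    import edu.stanford.nlp.trees.Tree;

    public class SegmentDemo {
        public static void main(String[] args) {
            // same Chinese model as the tokenizer below; it segments and parses raw text
            LexicalizedParser lp = LexicalizedParser.loadModel(
                    "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz");
            Tree tree = lp.parse("清华大学生说正在研究生命起源");
            // the leaves of the parse tree are the segmented words
            for (Tree leaf : tree.getLeaves()) {
                System.out.print(leaf.value() + " | ");
            }
        }
    }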

First, build a test class for the analyzer:


   
    
    package main.test;

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    public class AnalyzerTest {

        public static void analyzer(Analyzer analyzer, String text) {
            try {
                System.out.println("Analyzer class: " + analyzer.getClass());
                // obtain the TokenStream
                TokenStream tokenStream = analyzer.tokenStream("", new StringReader(text));
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    CharTermAttribute cta1 = tokenStream.getAttribute(CharTermAttribute.class);
                    OffsetAttribute ofa = tokenStream.getAttribute(OffsetAttribute.class);
                    // position increment attribute, which stores the distance between terms
                    // PositionIncrementAttribute pia = tokenStream.getAttribute(PositionIncrementAttribute.class);
                    // System.out.print(pia.getPositionIncrement() + ":");
                    System.out.print("[" + ofa.startOffset() + "-" + ofa.endOffset() + "]-->" + cta1.toString() + "\n");
                }
                tokenStream.end();
                tokenStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        public static void main(String[] args) {
            // Chinese test sentence with classic segmentation ambiguity
            String chText = "清华大学生说正在研究生命起源";
            Analyzer analyzer = new NlpHhcAnalyzer();
            analyzer(analyzer, chText);
        }
    }

Next, define a new analyzer by extending the Analyzer class and overriding its createComponents method, which returns a TokenStreamComponents. Note that in Lucene 4.x, TokenStreamComponents is a component wrapper that bundles what Lucene 3.x handled separately as the Tokenizer and its TokenFilters.


   
    
    package main.test;

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;

    public class NlpHhcAnalyzer extends Analyzer {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // wrap our Stanford-Parser-based Tokenizer; no extra filters are chained here
            return new TokenStreamComponents(new aaa(reader));
        }
    }
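
As a side note on the composition point above: if you also wanted to run token filters after the Stanford-based tokenizer, the Lucene 4.x TokenStreamComponents takes the Tokenizer as the source and the end of the filter chain as the sink. A hedged sketch follows (the class name NlpHhcFilteredAnalyzer is made up, and LowerCaseFilter is only a stand-in to show the chaining):

    package main.test;

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.LowerCaseFilter;
    import org.apache.lucene.util.Version;

    public class NlpHhcFilteredAnalyzer extends Analyzer {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // the Tokenizer is the source of the chain
            Tokenizer source = new aaa(reader);
            // filters wrap the source; Version.LUCENE_4_10_3 is assumed to match the jar in use
            TokenStream filtered = new LowerCaseFilter(Version.LUCENE_4_10_3, source);
            // the components hold both the source and the final filtered stream
            return new TokenStreamComponents(source, filtered);
        }
    }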


Now implement the new Tokenizer class, aaa. This part of the code is still rough and has bugs; I did not have time to debug and study it further, so readers with spare time are welcome to improve it.


   
    
    package main.test;

    import java.io.IOException;
    import java.io.Reader;
    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.List;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
    import edu.stanford.nlp.trees.Tree;
    import edu.stanford.nlp.trees.TypedDependency;
    import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure;

    public class aaa extends Tokenizer {

        // term text attribute
        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        // term offset attribute
        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        // the parser is expensive to load, so load it once per Tokenizer instance
        private final LexicalizedParser lp;

        // terms extracted from the parse of the current input, plus iteration state
        private List<String> terms;
        private int termIndex;
        private int currentOffset;

        public aaa(Reader in) {
            super(in);
            String modelpath = "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz";
            lp = LexicalizedParser.loadModel(modelpath);
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            // read the whole input (an earlier draft only read the first 100 characters)
            StringBuilder sb = new StringBuilder();
            char[] buf = new char[1024];
            int n;
            while ((n = input.read(buf)) != -1) {
                sb.append(buf, 0, n);
            }
            String str = sb.toString();

            // parse once and collect the dependent of every typed dependency as a term
            terms = new ArrayList<String>();
            if (str.length() > 0) {
                Tree t = lp.parse(str);
                ChineseGrammaticalStructure gs = new ChineseGrammaticalStructure(t);
                Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
                for (TypedDependency td : tdl) {
                    terms.add(td.dep().nodeString().trim());
                }
            }
            termIndex = 0;
            currentOffset = 0;
        }

        @Override
        public boolean incrementToken() throws IOException {
            // clear all token attributes
            clearAttributes();
            if (terms == null || termIndex >= terms.size()) {
                // no more terms: return false to signal the end of the stream
                return false;
            }
            String term = terms.get(termIndex++);
            // set the term text
            termAtt.append(term);
            // set the term offsets; this assumes the terms are adjacent in the input,
            // which is only an approximation (dependency order need not match surface order)
            offsetAtt.setOffset(currentOffset, currentOffset + term.length());
            currentOffset += term.length();
            // return true to signal that another term is available
            return true;
        }
    }
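
One of the remaining rough spots is the offset bookkeeping: it assumes each term sits immediately after the previous one, but the order of typed dependencies need not match the surface order of the text. A hedged helper sketch (the names OffsetCalc and offsetsFor are made up for illustration) that instead locates each term in the original text:

    package main.test;

    import java.util.ArrayList;
    import java.util.List;

    public class OffsetCalc {

        /** Returns a {start, end} pair for each term, searched left to right in the text. */
        public static List<int[]> offsetsFor(String text, List<String> terms) {
            List<int[]> offsets = new ArrayList<int[]>();
            int from = 0;
            for (String term : terms) {
                int start = text.indexOf(term, from);
                if (start < 0) {
                    // not found after the previous term: fall back to a full scan
                    start = text.indexOf(term);
                }
                int end = (start < 0) ? -1 : start + term.length();
                offsets.add(new int[] { start, end });
                if (end > 0) {
                    from = end;
                }
            }
            return offsets;
        }
    }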



Copyright notice: this article was written by [武念]. Please include a link to the original when reposting. Thanks.
https://blog.csdn.net/weixin_43813200/article/details/125903677
