
Implementing a Tokenizer with Stanford Parser (Intelligent Language Processing)

2022-07-22 21:23:00 武念

Yesterday I spent some time studying Stanford Parser, with the idea of combining its intelligent word segmentation with a Lucene analyzer. The project schedule was tight, so part of the investigation is unfinished and the code still has bugs; readers interested in this approach are welcome to improve it.

Lucene version: 4.10.3. Jars to add: stanford-parser-3.3.0-models.jar and stanford-parser.jar.
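
Before wiring anything into Lucene, it helps to see the segmentation effect on its own. The following is a minimal standalone sketch (not from the original post): it loads the Chinese xinhuaFactoredSegmenting model used later and prints the leaves of the parse tree, which are the segmented words. The class name SegmentDemo is made up for illustration, and the lp.parse(String) call simply mirrors how the tokenizer code below uses the parser.

    package main.test;

    import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
    import edu.stanford.nlp.trees.Tree;

    public class SegmentDemo {
        public static void main(String[] args) {
            // same Chinese model as the tokenizer below; it segments and parses raw text
            LexicalizedParser lp = LexicalizedParser.loadModel(
                    "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz");
            Tree tree = lp.parse("清华大学生说正在研究生命起源");
            // the leaves of the parse tree are the segmented words
            for (Tree leaf : tree.getLeaves()) {
                System.out.print(leaf.value() + " | ");
            }
        }
    }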

First, build a test class for the analyzer:


   
    
    package main.test;

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    public class AnalyzerTest {

        public static void analyzer(Analyzer analyzer, String text) {
            try {
                System.out.println("Analyzer class: " + analyzer.getClass());
                // obtain the TokenStream
                TokenStream tokenStream = analyzer.tokenStream("", new StringReader(text));
                tokenStream.reset();
                while (tokenStream.incrementToken()) {
                    CharTermAttribute cta1 = tokenStream.getAttribute(CharTermAttribute.class);
                    OffsetAttribute ofa = tokenStream.getAttribute(OffsetAttribute.class);
                    // position increment attribute, which stores the distance between terms
                    // PositionIncrementAttribute pia = tokenStream.getAttribute(PositionIncrementAttribute.class);
                    // System.out.print(pia.getPositionIncrement() + ":");
                    System.out.print("[" + ofa.startOffset() + "-" + ofa.endOffset() + "]-->" + cta1.toString() + "\n");
                }
                tokenStream.end();
                tokenStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        public static void main(String[] args) {
            // Chinese test sentence with classic segmentation ambiguity
            String chText = "清华大学生说正在研究生命起源";
            Analyzer analyzer = new NlpHhcAnalyzer();
            analyzer(analyzer, chText);
        }
    }

Next, define a new analyzer by extending the Analyzer class and overriding its createComponents method, which returns a TokenStreamComponents. Note that in Lucene 4.x, TokenStreamComponents is a component wrapper that bundles what Lucene 3.x handled separately as the Tokenizer and its TokenFilters.


   
    
    package main.test;

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;

    public class NlpHhcAnalyzer extends Analyzer {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // wrap our Stanford-Parser-based Tokenizer; no extra filters are chained here
            return new TokenStreamComponents(new aaa(reader));
        }
    }
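
As a side note on the composition point above: if you also wanted to run token filters after the Stanford-based tokenizer, the Lucene 4.x TokenStreamComponents takes the Tokenizer as the source and the end of the filter chain as the sink. A hedged sketch follows (the class name NlpHhcFilteredAnalyzer is made up, and LowerCaseFilter is only a stand-in to show the chaining):

    package main.test;

    import java.io.Reader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.LowerCaseFilter;
    import org.apache.lucene.util.Version;

    public class NlpHhcFilteredAnalyzer extends Analyzer {

        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
            // the Tokenizer is the source of the chain
            Tokenizer source = new aaa(reader);
            // filters wrap the source; Version.LUCENE_4_10_3 is assumed to match the jar in use
            TokenStream filtered = new LowerCaseFilter(Version.LUCENE_4_10_3, source);
            // the components hold both the source and the final filtered stream
            return new TokenStreamComponents(source, filtered);
        }
    }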


Now implement the new Tokenizer class, aaa. This part of the code is still rough and has bugs; I did not have time to debug and study it further, so readers with spare time are welcome to improve it.


   
    
    package main.test;

    import java.io.IOException;
    import java.io.Reader;
    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.List;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
    import edu.stanford.nlp.trees.Tree;
    import edu.stanford.nlp.trees.TypedDependency;
    import edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure;

    public class aaa extends Tokenizer {

        // term text attribute
        private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        // term offset attribute
        private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        // the parser is expensive to load, so load it once per Tokenizer instance
        private final LexicalizedParser lp;

        // terms extracted from the parse of the current input, plus iteration state
        private List<String> terms;
        private int termIndex;
        private int currentOffset;

        public aaa(Reader in) {
            super(in);
            String modelpath = "edu/stanford/nlp/models/lexparser/xinhuaFactoredSegmenting.ser.gz";
            lp = LexicalizedParser.loadModel(modelpath);
        }

        @Override
        public void reset() throws IOException {
            super.reset();
            // read the whole input (an earlier draft only read the first 100 characters)
            StringBuilder sb = new StringBuilder();
            char[] buf = new char[1024];
            int n;
            while ((n = input.read(buf)) != -1) {
                sb.append(buf, 0, n);
            }
            String str = sb.toString();

            // parse once and collect the dependent of every typed dependency as a term
            terms = new ArrayList<String>();
            if (str.length() > 0) {
                Tree t = lp.parse(str);
                ChineseGrammaticalStructure gs = new ChineseGrammaticalStructure(t);
                Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
                for (TypedDependency td : tdl) {
                    terms.add(td.dep().nodeString().trim());
                }
            }
            termIndex = 0;
            currentOffset = 0;
        }

        @Override
        public boolean incrementToken() throws IOException {
            // clear all token attributes
            clearAttributes();
            if (terms == null || termIndex >= terms.size()) {
                // no more terms: return false to signal the end of the stream
                return false;
            }
            String term = terms.get(termIndex++);
            // set the term text
            termAtt.append(term);
            // set the term offsets; this assumes the terms are adjacent in the input,
            // which is only an approximation (dependency order need not match surface order)
            offsetAtt.setOffset(currentOffset, currentOffset + term.length());
            currentOffset += term.length();
            // return true to signal that another term is available
            return true;
        }
    }
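
One of the remaining rough spots is the offset bookkeeping: it assumes each term sits immediately after the previous one, but the order of typed dependencies need not match the surface order of the text. A hedged helper sketch (the names OffsetCalc and offsetsFor are made up for illustration) that instead locates each term in the original text:

    package main.test;

    import java.util.ArrayList;
    import java.util.List;

    public class OffsetCalc {

        /** Returns a {start, end} pair for each term, searched left to right in the text. */
        public static List<int[]> offsetsFor(String text, List<String> terms) {
            List<int[]> offsets = new ArrayList<int[]>();
            int from = 0;
            for (String term : terms) {
                int start = text.indexOf(term, from);
                if (start < 0) {
                    // not found after the previous term: fall back to a full scan
                    start = text.indexOf(term);
                }
                int end = (start < 0) ? -1 : start + term.length();
                offsets.add(new int[] { start, end });
                if (end > 0) {
                    from = end;
                }
            }
            return offsets;
        }
    }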



Copyright notice: this article was written by [武念]. Please include a link to the original when reposting. Thanks.
https://blog.csdn.net/weixin_43813200/article/details/125903677
