JAV程序解析搜狗词库scel文件格式

qindongliang1922

浏览: 2145187 次
性别:
来自: 北京

最近访客更多访客>>

godandghost

youhere

tanss

fengshuo850420

博主相关

博客

微博

相册

留言

关于我

博客专栏

: 证道Lucene4
浏览量：116259

: 证道Hadoop
浏览量：124512

: 证道shell编程
浏览量：58370

: ELK修真
浏览量：70306

文章分类

社区版块

存档分类

博客分类：

JAVA

java scel 词库

在做一个电商的网站的初期时，我们常常面临词库的问题，因为我们并没有比较好的词库，这时候呢，我们就可以从网上下一些，别人有的词库，这些词库有淘宝的，有搜狗的，搜狗的分类比较细，我们可以根据下载与我们行业比较相关的词库，但这些词库一般都是scel格式的，直接使用JAVA解析，是没法解析的，如果遇到这种情况可用散仙下面的这个类，来解析，经测试无乱码现象，解析完整度还不错。

源码如下:

package com.qin.parse.scel;



import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

public class SougouScelReader {

    public SougouScelMdel read(File file) throws IOException {
        return read(new FileInputStream(file));
    }

    public SougouScelMdel read(URL url) throws IOException {
        return read(url.openStream());
    }

    protected ByteArrayOutputStream output=new ByteArrayOutputStream();

    protected String readString(DataInputStream input,int pos,int[] reads) throws IOException {
        int read=reads[0];
        input.skip(pos-read);
        read=pos;
        output.reset();
        while(true) {
            int c1 = input.read();
            int c2 = input.read();
            read+=2;
            if(c1==0 && c2==0) {
                break;
            } else {
                output.write(c1);
                output.write(c2);
            }
        }
        reads[0]=read;
        return new String(output.toByteArray(),encoding);
    }

    protected static String encoding = "UTF-16LE";

    public SougouScelMdel read(InputStream in) throws IOException {
        SougouScelMdel model = new SougouScelMdel();
        DataInputStream input = new DataInputStream(in);
        int read;
        try {
            byte[] bytes = new byte[4];
            input.readFully(bytes);
            assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
            input.readFully(bytes);
            int flag1 = bytes[0];
            assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);
            int[] reads=new int[]{8};
            model.setName(readString(input,0x130,reads));
            model.setType(readString(input,0x338,reads));
            model.setDescription(readString(input,0x540,reads));
            model.setSample(readString(input,0xd40,reads));
            read = reads[0];
            input.skip(0x1540 - read);
            read=0x1540;
            input.readFully(bytes);
            read += 4;
            assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);
            bytes = new byte[128];
            Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
            while (true) {
                int mark = readUnsignedShort(input);
                int size = input.readUnsignedByte();
                input.skip(1);
                read += 4;
                assert (size > 0 && (size % 2) == 0);
                input.readFully(bytes, 0, size);
                read += size;
                String py = new String(bytes, 0, size, encoding);
                //System.out.println(py);
                pyMap.put(mark, py);
                if ("zuo".equals(py)) {
                    break;
                }
            }
            if (flag1 == 0x44) {
                input.skip(0x2628 - read);
            } else if (flag1 == 0x45) {
                input.skip(0x26C4 - read);
            } else {
                throw new RuntimeException("出现意外，联系作者");
            }
            StringBuffer buffer = new StringBuffer();
            Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
            while (true) {
                int size = readUnsignedShort(input);
                if (size < 0) {
                    break;
                }
                int count = readUnsignedShort(input);
                int len = count / 2;
                assert (len * 2 == count);
                buffer.setLength(0);
                for (int i = 0; i < len; i++) {
                    int key = readUnsignedShort(input);
                    buffer.append(pyMap.get(key)).append("'");
                }
                buffer.setLength(buffer.length() - 1);
                String py = buffer.toString();
                List<String> list = wordMap.get(py);
                if (list == null) {
                    list = new ArrayList<String>();
                    wordMap.put(py, list);
                }
                for (int i = 0; i < size; i++) {
                    count = readUnsignedShort(input);
                    if (count > bytes.length) {
                        bytes = new byte[count];
                    }
                    input.readFully(bytes, 0, count);
                    String word = new String(bytes, 0, count, encoding);
                    //接下来12个字节可能是词频或者类似信息
                    input.skip(12);
                    list.add(word);
                }
            }
            //System.out.println(wordMap.size());
            model.setWordMap(wordMap);
            return model;
        } finally {
            in.close();
        }
    }

    protected final int readUnsignedShort(InputStream in) throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        if ((ch1 | ch2) < 0) {
            return Integer.MIN_VALUE;
        }
        return (ch2 << 8) + (ch1 << 0);
    }

}

//自行将此类提出来为public class
class SougouScelMdel {

    private Map<String, List<String>> wordMap;

    private String name;
    private String type;
    private String description;
    private String sample;

    public Map<String, List<String>> getWordMap() {
        return wordMap;
    }

    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getSample() {
        return sample;
    }

    public void setSample(String sample) {
        this.sample = sample;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    
    
    
    
}

package com.qin.parse.scel;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

/**
 * 解析sogo词库工具类
 * 
 * 
 * **/
public class ParseSogo {
	
	public static void main(String[] args)throws Exception {
		
   	 sogou("D:\\词库\\dianshang.scel","D:\\词库\\goods1.txt",false);
	}
   
	/**
	 * 读取scel的词库文件
	 * 生成txt格式的文件
	 * @param inputPath 输入路径
	 * @param outputPath 输出路径
	 * @param isAppend  是否拼接追加词库内容 
	 * true 代表追加,false代表重建
	 * 
	 * **/
   private static void sogou(String inputPath,String outputPath,boolean isAppend) throws IOException{  
       File file=new File(inputPath);  
       if(!isAppend){
       if(Files.exists(Paths.get(outputPath),LinkOption.values())){
    	   System.out.println("存储此文件已经删除");
    	   Files.deleteIfExists(Paths.get(outputPath));
    	   
       }
       }
       RandomAccessFile raf=new RandomAccessFile(outputPath, "rw");
      
       int count=0;
       SougouScelMdel model = new SougouScelReader().read(file);  
       Map<String,List<String>> words = model.getWordMap(); //词<拼音,词>  
       Set<Entry<String,List<String>>> set = words.entrySet();  
       Iterator<Entry<String,List<String>>> iter = set.iterator();  
       while(iter.hasNext()){  
           Entry<String,List<String>> entry = iter.next();  
           List<String> list = entry.getValue();  
           int size = list.size();  
           for(int i = 0; i < size; i++){  
               String word = list.get(i);  
               
               //System.out.println(word); 
               raf.seek(raf.getFilePointer());
               raf.write((word+"\n").getBytes());//写入txt文件
               count++;
               
               
           }  
       }  
       raf.close();
       System.out.println("生成txt成功！,总计写入: "+count+" 条数据！");
   }  

}

分享到：

JAVA各种排序思想和示例 | 如何将Lucene索引写入Hadoop？

2014-07-04 17:36
浏览 3806
评论(1)
论坛回复 / 浏览 (0 / 3429)
分类:编程语言
查看更多

1 楼 x0070704 2014-12-03

这个工具好强大，大部分的文件都被解析出来了，但我发现有些解析不了，会在input.readFully(bytes, 0, count);那行报IndexOutOfBoundsException错。虽然我也在研究怎样解决这问题，但鉴于自己也是刚接触，暂时未完全了解其原理，希望作者也一起解决一下

暂时发现不能解析的文件如下：
农林渔畜/渔业/船名前缀.scel
农林渔畜/渔业/海洋工程与海洋环境专有名词.scel
社会科学/教育教学/教育局股室.scel
社会科学/广告传媒/悠活网词汇.scel
工程应用/环境能源/核能词汇.scel

发表评论

您还没有登录,请您登录后再发表评论