如何用 SimHash 算法检测文本中的长尾词相似度?
- 内容介绍
- 文章标签
- 相关推荐
本文共计554个文字,预计阅读时间需要3分钟。
plaintext package similarity; import java.io.File; import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.util.HashMap; import java.util.StringTokenizer;
/** * */
package similarity;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.StringTokenizer;
/**
*
* simhash
* @author tzj
*
*/
public class SimHash {
// Input text handed to the constructor and stored unchanged.
private String tokens;
// Value returned by simHash(); assigned in both constructors.
private BigInteger intSimHash;
// NOTE(review): never assigned in the visible code; presumably a string form of the hash — confirm.
private String strSimHash;
// Defaults to 64; the two-argument constructor overwrites it.
// Presumably the width of the hash in bits — confirm against simHash().
private int hashbits = 64;
public SimHash(String tokens) {
this.tokens = tokens;
this.intSimHash = this.simHash();
}
public SimHash(String tokens, int hashbits) {
this.tokens = tokens;
this.hashbits = hashbits;
this.intSimHash = this.simHash();
}
HashMap
本文共计554个文字,预计阅读时间需要3分钟。
plaintext package similarity; import java.io.File; import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.util.HashMap; import java.util.StringTokenizer;
/** * */
package similarity;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.StringTokenizer;
/**
*
* simhash
* @author tzj
*
*/
public class SimHash {
// Input text handed to the constructor and stored unchanged.
private String tokens;
// Value returned by simHash(); assigned in both constructors.
private BigInteger intSimHash;
// NOTE(review): never assigned in the visible code; presumably a string form of the hash — confirm.
private String strSimHash;
// Defaults to 64; the two-argument constructor overwrites it.
// Presumably the width of the hash in bits — confirm against simHash().
private int hashbits = 64;
public SimHash(String tokens) {
this.tokens = tokens;
this.intSimHash = this.simHash();
}
public SimHash(String tokens, int hashbits) {
this.tokens = tokens;
this.hashbits = hashbits;
this.intSimHash = this.simHash();
}
HashMap

