余弦相似度-Java代码

// 大三 Java 作业,仅供参考。

// @刘_学



package cn;



import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;





/**
 * Demo program that prints the cosine similarity of two short English
 * sentences.
 *
 * <p>Each sentence is stemmed with {@link PorterStemmer}, split into
 * whitespace-separated tokens, and turned into a term-frequency vector. The
 * similarity is the dot product of the two vectors divided by the product of
 * their Euclidean norms.
 */
public class CosSimilarity
{

    public CosSimilarity()
    {
    }

    /**
     * Stems the two hard-coded sample sentences and prints their cosine
     * similarity to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        String D1 = "I don't loves DataBase but java.";
        String D2 = "I don't like CaoJiba but JianXu.";

        PorterStemmer s = new PorterStemmer(); // normalises word forms (stemming)
        D1 = s.getStemmer(D1);
        D2 = s.getStemmer(D2);

        double similarity = cosine(termFrequencies(D1), termFrequencies(D2));
        System.out.println("余弦相似度为" + similarity); // print the result
    }

    /**
     * Counts how often each whitespace-separated token occurs in {@code text}.
     * Tokens are compared case-insensitively (keys are lower-cased) and empty
     * tokens produced by leading or repeated whitespace are ignored.
     *
     * @param text the text to tokenise
     * @return a map from lower-cased token to its number of occurrences
     */
    private static Map<String, Integer> termFrequencies(String text)
    {
        Map<String, Integer> counts = new HashMap<String, Integer>();
        String[] tokens = text.split("\\s+");
        for (int i = 0; i < tokens.length; i++)
        {
            if (tokens[i].length() == 0)
            {
                continue;
            }
            String term = tokens[i].toLowerCase(Locale.ROOT);
            Integer seen = counts.get(term);
            counts.put(term, seen == null ? 1 : seen + 1);
        }
        return counts;
    }

    /**
     * Computes the cosine similarity of two term-frequency vectors.
     *
     * @param first  term counts of the first document
     * @param second term counts of the second document
     * @return the cosine of the angle between the vectors, or {@code 0.0}
     *         when either vector is empty (avoids a 0/0 division yielding NaN)
     */
    private static double cosine(Map<String, Integer> first, Map<String, Integer> second)
    {
        // Dot product: only terms present in both documents contribute.
        double dotProduct = 0;
        for (Map.Entry<String, Integer> entry : first.entrySet())
        {
            Integer other = second.get(entry.getKey());
            if (other != null)
            {
                dotProduct += (double) entry.getValue() * (double) other;
            }
        }

        // Squared Euclidean norms of both vectors.
        double sqW1 = 0, sqW2 = 0;
        for (Integer count : first.values())
        {
            sqW1 += (double) count * (double) count;
        }
        for (Integer count : second.values())
        {
            sqW2 += (double) count * (double) count;
        }

        if (sqW1 == 0 || sqW2 == 0)
        {
            return 0.0;
        }
        return dotProduct / Math.sqrt(sqW1 * sqW2);
    }

}

PorterStemmer 相关代码:将下列代码复制到记事本,并保存为 PorterStemmer.java(文件名必须与类名一致,且与 CosSimilarity.java 放在同一个 cn 包目录下)即可。

package cn;





    import java.io.ByteArrayInputStream;

    import java.io.FileInputStream;

    import java.io.FileNotFoundException;

    import java.io.IOException;

    import java.io.InputStream;





    /**
     * Porter stemmer for English words.
     *
     * <p>Characters of a word are pushed into an internal buffer with
     * {@link #add(char)} (or {@link #add(char[], int)}); {@link #stem()} then
     * reduces the buffered word in place and the result is read back with
     * {@link #toString()} or {@link #getResultBuffer()} /
     * {@link #getResultLength()}. {@link #getStemmer(String)} is a convenience
     * wrapper that stems every word of a text.
     *
     * <p>The stemmer expects lower-case input; {@link #getStemmer(String)}
     * lower-cases letters itself. Instances hold mutable state and are reused
     * between words.
     */
    public class PorterStemmer
    {
        /** Buffer holding the word being stemmed. */
        private char[] b;

        private int i,     // write offset into b (number of chars added so far)
                    i_end, // length of the stemmed word after stem()
                    j, k;  // working offsets used by the stemming steps

        /** Amount by which the buffer grows when it is full. */
        private static final int INC = 50;

        public PorterStemmer()
        {
            b = new char[INC];
            i = 0;
            i_end = 0;
        }

        /**
         * Appends one character to the word being built, growing the buffer
         * when needed.
         *
         * @param ch the character to append
         */
        public void add(char ch)
        {
            if (i == b.length)
            {
                char[] new_b = new char[i + INC];
                System.arraycopy(b, 0, new_b, 0, i);
                b = new_b;
            }
            b[i++] = ch;
        }

        /**
         * Appends the first {@code wLen} characters of {@code w} to the word
         * being built, growing the buffer when needed.
         *
         * @param w    source characters
         * @param wLen number of characters to copy from the start of {@code w}
         */
        public void add(char[] w, int wLen)
        {
            if (i + wLen >= b.length)
            {
                char[] new_b = new char[i + wLen + INC];
                System.arraycopy(b, 0, new_b, 0, i);
                b = new_b;
            }
            for (int c = 0; c < wLen; c++)
            {
                b[i++] = w[c];
            }
        }

        /**
         * Returns the result of the last {@link #stem()} call as a string.
         */
        @Override
        public String toString()
        {
            return new String(b, 0, i_end);
        }

        /** Returns the length of the word produced by the last {@link #stem()} call. */
        public int getResultLength()
        {
            return i_end;
        }

        /**
         * Returns the internal buffer itself (not a copy); only the first
         * {@link #getResultLength()} characters are the stemmed word.
         */
        public char[] getResultBuffer()
        {
            return b;
        }

        /**
         * Returns true when b[i] is a consonant. 'y' counts as a consonant at
         * position 0 or when the preceding character is not a consonant.
         */
        private boolean cons(int i)
        {
            switch (b[i])
            {
                case 'a': case 'e': case 'i': case 'o': case 'u':
                    return false;
                case 'y':
                    return (i == 0) ? true : !cons(i - 1);
                default:
                    return true;
            }
        }

        /**
         * Counts the vowel-run/consonant-run pairs in b[0..j], ignoring any
         * leading consonants.
         */
        private int m()
        {
            int n = 0;
            int i = 0;
            // Skip the optional leading consonant run.
            while (true)
            {
                if (i > j) return n;
                if (!cons(i)) break;
                i++;
            }
            i++;
            while (true)
            {
                // Skip a vowel run.
                while (true)
                {
                    if (i > j) return n;
                    if (cons(i)) break;
                    i++;
                }
                i++;
                n++;
                // Skip a consonant run.
                while (true)
                {
                    if (i > j) return n;
                    if (!cons(i)) break;
                    i++;
                }
                i++;
            }
        }

        /** Returns true when b[0..j] contains a vowel. */
        private boolean vowelinstem()
        {
            for (int i = 0; i <= j; i++)
            {
                if (!cons(i)) return true;
            }
            return false;
        }

        /** Returns true when b[j-1] and b[j] are the same consonant. */
        private boolean doublec(int j)
        {
            if (j < 1) return false;
            if (b[j] != b[j - 1]) return false;
            return cons(j);
        }

        /**
         * Returns true when b[i-2..i] is consonant-vowel-consonant and the
         * final consonant is not 'w', 'x' or 'y'.
         */
        private boolean cvc(int i)
        {
            if (i < 2 || !cons(i) || cons(i - 1) || !cons(i - 2)) return false;
            int ch = b[i];
            if (ch == 'w' || ch == 'x' || ch == 'y') return false;
            return true;
        }

        /**
         * Returns true when b[0..k] ends with {@code s}; on success j is set to
         * the index just before the suffix.
         */
        private boolean ends(String s)
        {
            int l = s.length();
            int o = k - l + 1;
            if (o < 0) return false;
            for (int i = 0; i < l; i++)
            {
                if (b[o + i] != s.charAt(i)) return false;
            }
            j = k - l;
            return true;
        }

        /** Overwrites b[j+1..] with {@code s} and moves k to the new last index. */
        private void setto(String s)
        {
            int l = s.length();
            int o = j + 1;
            for (int i = 0; i < l; i++)
            {
                b[o + i] = s.charAt(i);
            }
            k = j + l;
        }

        /** Applies {@link #setto(String)} only when m() &gt; 0. */
        private void r(String s)
        {
            if (m() > 0) setto(s);
        }

        /** Removes plural endings and -ed / -ing, e.g. "pens" -> "pen". */
        private void step1()
        {
            if (b[k] == 's')
            {
                if (ends("sses")) k -= 2;
                else if (ends("ies")) setto("i");
                else if (b[k - 1] != 's') k--;
            }
            if (ends("eed"))
            {
                if (m() > 0) k--;
            }
            else if ((ends("ed") || ends("ing")) && vowelinstem())
            {
                k = j;
                if (ends("at")) setto("ate");
                else if (ends("bl")) setto("ble");
                else if (ends("iz")) setto("ize");
                else if (doublec(k))
                {
                    k--;
                    int ch = b[k];
                    // Keep the doubled consonant for l, s and z.
                    if (ch == 'l' || ch == 's' || ch == 'z') k++;
                }
                else if (m() == 1 && cvc(k)) setto("e");
            }
        }

        /** Turns a terminal 'y' into 'i' when the stem contains another vowel. */
        private void step2()
        {
            if (ends("y") && vowelinstem()) b[k] = 'i';
        }

        /**
         * Maps double suffixes to single ones (e.g. -ization -> -ize) when
         * m() &gt; 0 for the part before the suffix. Dispatches on the
         * penultimate character.
         */
        private void step3()
        {
            if (k == 0) return;
            switch (b[k - 1])
            {
                case 'a':
                    if (ends("ational")) { r("ate"); break; }
                    if (ends("tional")) { r("tion"); break; }
                    break;
                case 'c':
                    if (ends("enci")) { r("ence"); break; }
                    if (ends("anci")) { r("ance"); break; }
                    break;
                case 'e':
                    if (ends("izer")) { r("ize"); break; }
                    break;
                case 'l':
                    if (ends("bli")) { r("ble"); break; }
                    if (ends("alli")) { r("al"); break; }
                    if (ends("entli")) { r("ent"); break; }
                    if (ends("eli")) { r("e"); break; }
                    if (ends("ousli")) { r("ous"); break; }
                    break;
                case 'o':
                    if (ends("ization")) { r("ize"); break; }
                    if (ends("ation")) { r("ate"); break; }
                    if (ends("ator")) { r("ate"); break; }
                    break;
                case 's':
                    if (ends("alism")) { r("al"); break; }
                    if (ends("iveness")) { r("ive"); break; }
                    if (ends("fulness")) { r("ful"); break; }
                    if (ends("ousness")) { r("ous"); break; }
                    break;
                case 't':
                    if (ends("aliti")) { r("al"); break; }
                    if (ends("iviti")) { r("ive"); break; }
                    if (ends("biliti")) { r("ble"); break; }
                    break;
                case 'g':
                    if (ends("logi")) { r("log"); break; }
            }
        }

        /** Handles -icate, -ative, -alize, -iciti, -ical, -ful and -ness like step3. */
        private void step4()
        {
            switch (b[k])
            {
                case 'e':
                    if (ends("icate")) { r("ic"); break; }
                    if (ends("ative")) { r(""); break; }
                    if (ends("alize")) { r("al"); break; }
                    break;
                case 'i':
                    if (ends("iciti")) { r("ic"); break; }
                    break;
                case 'l':
                    if (ends("ical")) { r("ic"); break; }
                    if (ends("ful")) { r(""); break; }
                    break;
                case 's':
                    if (ends("ness")) { r(""); break; }
                    break;
            }
        }

        /** Strips -ant, -ence etc. when m() &gt; 1 for the remaining stem. */
        private void step5()
        {
            if (k == 0) return;
            switch (b[k - 1])
            {
                case 'a':
                    if (ends("al")) break;
                    return;
                case 'c':
                    if (ends("ance")) break;
                    if (ends("ence")) break;
                    return;
                case 'e':
                    if (ends("er")) break;
                    return;
                case 'i':
                    if (ends("ic")) break;
                    return;
                case 'l':
                    if (ends("able")) break;
                    if (ends("ible")) break;
                    return;
                case 'n':
                    if (ends("ant")) break;
                    if (ends("ement")) break;
                    if (ends("ment")) break;
                    if (ends("ent")) break;
                    return;
                case 'o':
                    // j >= 0 guards b[j] when the whole word is "ion".
                    if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
                    if (ends("ou")) break;
                    return;
                case 's':
                    if (ends("ism")) break;
                    return;
                case 't':
                    if (ends("ate")) break;
                    if (ends("iti")) break;
                    return;
                case 'u':
                    if (ends("ous")) break;
                    return;
                case 'v':
                    if (ends("ive")) break;
                    return;
                case 'z':
                    if (ends("ize")) break;
                    return;
                default:
                    return;
            }
            if (m() > 1) k = j;
        }

        /** Removes a final 'e' when m() allows it and reduces a final "ll" to "l" when m() &gt; 1. */
        private void step6()
        {
            j = k;
            if (b[k] == 'e')
            {
                int a = m();
                if (a > 1 || a == 1 && !cvc(k - 1)) k--;
            }
            if (b[k] == 'l' && doublec(k) && m() > 1) k--;
        }

        /**
         * Stems the word currently in the buffer in place. Words of one or two
         * characters are left unchanged. Afterwards the buffer is ready to
         * receive the next word via {@code add}.
         */
        public void stem()
        {
            k = i - 1;
            if (k > 1)
            {
                step1();
                step2();
                step3();
                step4();
                step5();
                step6();
            }
            i_end = k + 1;
            i = 0;
        }

        /**
         * Stems every word in {@code originaltext}.
         *
         * <p>A word is a maximal run of characters for which
         * {@link Character#isLetter(char)} is true; each word is lower-cased and
         * replaced by its stem. All other characters are copied through
         * unchanged. The text is processed as characters rather than as
         * platform-encoded bytes, so non-ASCII characters are preserved, and
         * words of any length are handled without truncation.
         *
         * @param originaltext the text to stem
         * @return the text with every word replaced by its lower-cased stem
         * @throws NullPointerException if {@code originaltext} is null
         */
        public String getStemmer(String originaltext)
        {
            int length = originaltext.length();
            StringBuilder stemtext = new StringBuilder(length);
            int pos = 0;
            while (pos < length)
            {
                char ch = originaltext.charAt(pos);
                if (!Character.isLetter(ch))
                {
                    stemtext.append(ch);
                    pos++;
                    continue;
                }
                // Feed the whole run of letters into the buffer, then stem it.
                while (pos < length && Character.isLetter(originaltext.charAt(pos)))
                {
                    add(Character.toLowerCase(originaltext.charAt(pos)));
                    pos++;
                }
                stem();
                stemtext.append(b, 0, i_end);
            }
            return stemtext.toString();
        }

        /**
         * Small demonstration: prints the stemmed form of a few sample phrases.
         *
         * @param args ignored
         */
        public static void main(String[] args)
        {
            PorterStemmer s = new PorterStemmer();
            System.out.println(s.getStemmer("parallel computer"));
            System.out.println(s.getStemmer("parallel computing"));
            System.out.println(s.getStemmer("pens"));
            System.out.println(s.getStemmer("pen"));
        }

    }

-------------本文结束 感谢您的阅读-------------
0%