// ------------------------------------------------------------
// SearchEngine.java
//   An example of using IntegerSets in Information Retrieval
//   for 605.201, Introduction to Programming Using Java
//
// Paul McNamee  (7/14/98)
// ------------------------------------------------------------

import java.util.*;

// ------------------------------------------------------------
// This class is only used here in this file, so it is not public.
// A document is an set of integers where the each integer corresponds
// to a word (term) in the Boolean vector space.

class Document {
  public IntegerSet set;

  public Document (String s) {
    set = new IntegerSet();
    StringTokenizer st = new StringTokenizer(s, " \t\n");
    while (st.hasMoreTokens()) {
      int index = SearchEngine.wordToNumber(st.nextToken().toLowerCase());
      if (index > 0)
        set.insertElement(index);
    }
  }
}

// ------------------------------------------------------------
public class SearchEngine {
  // A list of the words in the vector space
  static String [] dictionary = {
    "books", "library", "java", "computer", "science", "exam",
    "phone", "dog", "cat", "money", "vacation", "hawaii", "acadia",
    "national", "park", "driving", "car", "the", "algorithm"};

  // ------------------------------------------------------------
  // compare two documents (higher numbers are more similar)
  public static int similarity (Document a, Document b) {
    // number of words in common
    return a.set.intersectionOfIntegerSets(b.set).cardinality();
  }

  // ------------------------------------------------------------
  // convert a word to a corresponding index into the dictionary
  // if a words isn't found in our dictionary, return -1 instead.
  public static int wordToNumber (String s) {
    for(int i=0; i<dictionary.length; i++) {
      if (s.equalsIgnoreCase(dictionary[i]))
        return i;
    }
    return -1;  // word not found
  }

  // ------------------------------------------------------------
  // Take a query and compare it against an array of documents.  A
  // better subclass would start a graphical window, and show you the
  // text of the documents.  I'm not trying to write Netscape...

  public static void scoreDocsForQuery(Document [] pages, String s) {
    Document query = new Document(s);
    // Search all web pages (documents) and find the 'closest' one
    System.out.println("For Query: [" + s + "]");
    for (int i=0; i<pages.length; i++) {
      System.out.println("\tPage " + i +
                         " is scored: " + similarity(query, pages[i]) );
    }
  }


  // ------------------------------------------------------------
  // A test program that builds a database of 'Documents' and then runs
  // several queries against those documents.

  public static void main (String [] args) {
    Document [] webPages = new Document[5];
    webPages[0] = new Document("Java books in the computer science library");
    webPages[1] = new Document("Vacation in Acadia National Park");
    webPages[2] = new Document("You must be able to park a car in order to" +
                               "pass the driver's test");
    webPages[3] = new Document("Some dogs don't like cats");
    webPages[4] = new Document("Vacation plans in Hawaii");
    
    scoreDocsForQuery(webPages, "Is my favorite vacation spot Acadia or Java");
    scoreDocsForQuery(webPages, "Can you parallel park a car");
    scoreDocsForQuery(webPages, "Real computer scientists program in Java");
  }
}


// paulmac@nautilus: java SearchEngine
// For Query: [Is my favorite vacation spot Acadia or Java]
//         Page 0 is scored: 1
//         Page 1 is scored: 2
//         Page 2 is scored: 0
//         Page 3 is scored: 0
//         Page 4 is scored: 1
// For Query: [Can you parallel park a car]
//         Page 0 is scored: 0
//         Page 1 is scored: 1
//         Page 2 is scored: 2
//         Page 3 is scored: 0
//         Page 4 is scored: 0
// For Query: [Real computer scientists program in Java]
//         Page 0 is scored: 2
//         Page 1 is scored: 0
//         Page 2 is scored: 0
//         Page 3 is scored: 0
//         Page 4 is scored: 0
// paulmac@nautilus: 
