import structure5.*; import java.io.FileInputStream; import java.util.Scanner; /** * The purpose of this program is to investigate hash function * quality, notably uniformity. We look at three hash functions * for the String data type. * * 1. using the ASCII value of the first letter of the string. * 2. using the ASCII value of the last letter of the string. * 3. using the sum of the ASCII values of all of the letters * in the string. * * A "good" hash function will distribute values uniformly * in an array. Since we're really only considering a-z * here, we will see how uniformly we can hash strings to * 26 buckets. A perfect hash would uniformly distribute * values to all 26 buckets. That doesn't happen with these * but one of them gets surprisingly close! */ class LetterFreq { /* This index function is the ASCII value of the * first letter mod 26. */ public static int indexLastChar(String word) { return (int)word.charAt(word.length() - 1) % 26; } /* This index function is the ASCII value of the * last letter mod 26. */ public static int indexFirstChar(String word) { return (int)word.charAt(0) % 26; } /* This index function is the sum of the ASCII values * of all the letters mod 26. */ public static int indexAllChars(String word) { int sum = 0; for (int i = 0; i < word.length(); i++) { sum += (int)word.charAt(i); } return sum % 26; } public static String makeHistogram(Map counts) { String s = ""; for (Integer key: counts.keySet()) { s += key + ": " + counts.get(key) + "\n"; } return s; } /* The purpose of this program is to explore whether * any of the following three hash functions are "good" * when hashing common (US) English words: * 1. The first letter of the word. * 2. The last letter of the word. * 3. The sum of all of the letters of the word. */ public static void main(String[] args) { Map first_counts = new Hashtable<>(); Map last_counts = new Hashtable<>(); Map sum_counts = new Hashtable<>(); try { Scanner input = new Scanner(new FileInputStream("google-10000-english-usa.txt")); while(input.hasNext()) { // get next word String word = input.next().toLowerCase(); // get last letter of word int last = indexLastChar(word); // get first letter of word int first = indexFirstChar(word); // get sum of all letters int hval = indexAllChars(word); // count first if (!first_counts.containsKey(first)) { first_counts.put(first, 0); } first_counts.put(first, first_counts.get(first) + 1); // count last if (!last_counts.containsKey(last)) { last_counts.put(last, 0); } last_counts.put(last, last_counts.get(last) + 1); // count sum if (!sum_counts.containsKey(hval)) { sum_counts.put(hval, 0); } sum_counts.put(hval, sum_counts.get(hval) + 1); } } catch (Exception e) { // quit System.out.println("Can't find input!"); System.exit(1); } // first letter distribution System.out.println("First letter as hash:"); System.out.println(makeHistogram(first_counts)); // last letter distribution System.out.println("Last letter as hash:"); System.out.println(makeHistogram(last_counts)); // sum value distribution System.out.println("Sum of letters as hash:"); System.out.println(makeHistogram(sum_counts)); } }