Fixed tokenization issue with special (non-ASCII) characters on Mac: WordTokenizerTest now derives its expected tokens programmatically via a local tokenize() helper instead of hard-coding them.

git-svn-id: https://hyracks.googlecode.com/svn/trunk@603 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
index 41b63e7..b31205f 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
@@ -49,6 +49,36 @@
     private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
     private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
 
+    private boolean isSeparator(char ch) {
+        return !Character.isLetterOrDigit(ch) && Character.getType(ch) != Character.OTHER_LETTER && Character.getType(ch) != Character.OTHER_NUMBER;
+    }
+    
+    // Tokenizes text into lower-cased tokens split on separator characters.
+    // Bounds-checked so empty or all-separator input cannot overrun, and a
+    // trailing run of separators does not produce a spurious empty token.
+    private void tokenize(String text, ArrayList<String> tokens) {
+        String lowerCaseText = text.toLowerCase();
+        int startIx = 0;
+
+        while (startIx < lowerCaseText.length()) {
+            // Skip the run of separators before the next token.
+            while (startIx < lowerCaseText.length() && isSeparator(lowerCaseText.charAt(startIx))) {
+                startIx++;
+            }
+            int tokenStart = startIx;
+
+            // Consume the run of token characters.
+            while (startIx < lowerCaseText.length() && !isSeparator(lowerCaseText.charAt(startIx))) {
+                startIx++;
+            }
+            int tokenEnd = startIx;
+
+            if (tokenStart < tokenEnd) {
+                tokens.add(lowerCaseText.substring(tokenStart, tokenEnd));
+            }
+        }
+    }
+    
     @Before
     public void init() throws IOException {
         // serialize text into bytes
@@ -56,27 +86,10 @@
         DataOutput dos = new DataOutputStream(baos);
         dos.writeUTF(text);
         inputBuffer = baos.toByteArray();
-
+        
         // init expected string tokens
-        expectedUTF8Tokens.add("hello");
-        expectedUTF8Tokens.add("world");
-        expectedUTF8Tokens.add("i");
-        expectedUTF8Tokens.add("would");
-        expectedUTF8Tokens.add("like");
-        expectedUTF8Tokens.add("to");
-        expectedUTF8Tokens.add("inform");
-        expectedUTF8Tokens.add("you");
-        expectedUTF8Tokens.add("of");
-        expectedUTF8Tokens.add("the");
-        expectedUTF8Tokens.add("importance");
-        expectedUTF8Tokens.add("of");
-        expectedUTF8Tokens.add("foo");
-        expectedUTF8Tokens.add("bar");
-        expectedUTF8Tokens.add("yes");
-        expectedUTF8Tokens.add("foo");
-        expectedUTF8Tokens.add("bar");
-        expectedUTF8Tokens.add("jürgen");
-
+        tokenize(text, expectedUTF8Tokens);
+        
         // hashed tokens ignoring token count
         for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
             int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
@@ -206,9 +219,10 @@
     public int tokenHash(String token, int tokenCount) {
         int h = AbstractUTF8Token.GOLDEN_RATIO_32;
         for (int i = 0; i < token.length(); i++) {
-            h ^= token.charAt(i);
+            // Fold each UTF-16 code unit of the token into the hash.
+            h ^= token.charAt(i);
             h *= AbstractUTF8Token.GOLDEN_RATIO_32;
         }
         return h + tokenCount;
     }
 }