Fixed tokenization of special (non-ASCII) characters in WordTokenizerTest on Mac.
git-svn-id: https://hyracks.googlecode.com/svn/trunk@603 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
index 41b63e7..b31205f 100644
--- a/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
+++ b/hyracks/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
@@ -49,6 +49,36 @@
private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
+ private boolean isSeparator(char c) {
+ return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+ }
+
+ private void tokenize(String text, ArrayList<String> tokens) {
+ String lowerCaseText = text.toLowerCase();
+ int startIx = 0;
+
+ // Skip separators at beginning of string.
+ while(isSeparator(lowerCaseText.charAt(startIx))) {
+ startIx++;
+ }
+ while(startIx < lowerCaseText.length()) {
+ while(startIx < lowerCaseText.length() && isSeparator(lowerCaseText.charAt(startIx))) {
+ startIx++;
+ }
+ int tokenStart = startIx;
+
+ while(startIx < lowerCaseText.length() && !isSeparator(lowerCaseText.charAt(startIx))) {
+ startIx++;
+ }
+ int tokenEnd = startIx;
+
+ // Emit token.
+ String token = lowerCaseText.substring(tokenStart, tokenEnd);
+
+ tokens.add(token);
+ }
+ }
+
@Before
public void init() throws IOException {
// serialize text into bytes
@@ -56,27 +86,10 @@
DataOutput dos = new DataOutputStream(baos);
dos.writeUTF(text);
inputBuffer = baos.toByteArray();
-
+
// init expected string tokens
- expectedUTF8Tokens.add("hello");
- expectedUTF8Tokens.add("world");
- expectedUTF8Tokens.add("i");
- expectedUTF8Tokens.add("would");
- expectedUTF8Tokens.add("like");
- expectedUTF8Tokens.add("to");
- expectedUTF8Tokens.add("inform");
- expectedUTF8Tokens.add("you");
- expectedUTF8Tokens.add("of");
- expectedUTF8Tokens.add("the");
- expectedUTF8Tokens.add("importance");
- expectedUTF8Tokens.add("of");
- expectedUTF8Tokens.add("foo");
- expectedUTF8Tokens.add("bar");
- expectedUTF8Tokens.add("yes");
- expectedUTF8Tokens.add("foo");
- expectedUTF8Tokens.add("bar");
- expectedUTF8Tokens.add("jürgen");
-
+ tokenize(text, expectedUTF8Tokens);
+
// hashed tokens ignoring token count
for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
@@ -206,9 +219,11 @@
public int tokenHash(String token, int tokenCount) {
int h = AbstractUTF8Token.GOLDEN_RATIO_32;
for (int i = 0; i < token.length(); i++) {
- h ^= token.charAt(i);
+ System.out.print((int)token.charAt(i) + " ");
+ h ^= token.charAt(i);
h *= AbstractUTF8Token.GOLDEN_RATIO_32;
}
+ System.out.println("CHK");
return h + tokenCount;
}
}