Manually applied r603 and r605 from trunk.

git-svn-id: https://hyracks.googlecode.com/svn/branches/hyracks_dev_next@838 123451ca-8445-de46-9d55-352943316053
diff --git a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
index 57fe306..53fb96d 100644
--- a/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
+++ b/hyracks-tests/hyracks-storage-am-invertedindex-test/src/test/java/edu/uci/ics/hyracks/storage/am/invertedindex/WordTokenizerTest.java
@@ -42,181 +42,180 @@
 
 public class WordTokenizerTest {
 
-	private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
-	private byte[] inputBuffer;
+    private String text = "Hello World, I would like to inform you of the importance of Foo Bar. Yes, Foo Bar. Jürgen.";
+    private byte[] inputBuffer;
 
-	private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
-	private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
-	private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
+    private ArrayList<String> expectedUTF8Tokens = new ArrayList<String>();
+    private ArrayList<Integer> expectedHashedUTF8Tokens = new ArrayList<Integer>();
+    private ArrayList<Integer> expectedCountedHashedUTF8Tokens = new ArrayList<Integer>();
 
-	@Before
-	public void init() throws IOException {
-		// serialize text into bytes
-		ByteArrayOutputStream baos = new ByteArrayOutputStream();
-		DataOutput dos = new DataOutputStream(baos);
-		dos.writeUTF(text);
-		inputBuffer = baos.toByteArray();
+    private boolean isSeparator(char c) {
+        return !(Character.isLetterOrDigit(c) || Character.getType(c) == Character.OTHER_LETTER || Character.getType(c) == Character.OTHER_NUMBER);
+    }
+
+    private void tokenize(String text, ArrayList<String> tokens) {
+        String lowerCaseText = text.toLowerCase();
+        int startIx = 0;
+
+        // Skip separators at beginning of string.
+        while (startIx < lowerCaseText.length() && isSeparator(lowerCaseText.charAt(startIx))) {
+            startIx++;
+        }
+        while (startIx < lowerCaseText.length()) {
+            while (startIx < lowerCaseText.length() && isSeparator(lowerCaseText.charAt(startIx))) {
+                startIx++;
+            }
+            int tokenStart = startIx;
+
+            while (startIx < lowerCaseText.length() && !isSeparator(lowerCaseText.charAt(startIx))) {
+                startIx++;
+            }
+            int tokenEnd = startIx;
+
+            // Emit token; guard against the empty token produced by trailing separators.
+            String token = lowerCaseText.substring(tokenStart, tokenEnd);
+
+            if (!token.isEmpty()) tokens.add(token);
+        }
+    }
+    
+    @Before
+    public void init() throws IOException {
+        // serialize text into bytes
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        DataOutput dos = new DataOutputStream(baos);
+        dos.writeUTF(text);
+        inputBuffer = baos.toByteArray();
+        
+        // init expected string tokens
+        tokenize(text, expectedUTF8Tokens);
+        
+        // hashed tokens ignoring token count
+        for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
+            int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
+            expectedHashedUTF8Tokens.add(hash);
+        }
 
-		// init expected string tokens
-		expectedUTF8Tokens.add("hello");
-		expectedUTF8Tokens.add("world");
-		expectedUTF8Tokens.add("i");
-		expectedUTF8Tokens.add("would");
-		expectedUTF8Tokens.add("like");
-		expectedUTF8Tokens.add("to");
-		expectedUTF8Tokens.add("inform");
-		expectedUTF8Tokens.add("you");
-		expectedUTF8Tokens.add("of");
-		expectedUTF8Tokens.add("the");
-		expectedUTF8Tokens.add("importance");
-		expectedUTF8Tokens.add("of");
-		expectedUTF8Tokens.add("foo");
-		expectedUTF8Tokens.add("bar");
-		expectedUTF8Tokens.add("yes");
-		expectedUTF8Tokens.add("foo");
-		expectedUTF8Tokens.add("bar");
-		expectedUTF8Tokens.add("jürgen");
+        // hashed tokens using token count
+        HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
+        for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
+            Integer count = tokenCounts.get(expectedUTF8Tokens.get(i));
+            if (count == null) {
+                count = 1;
+                tokenCounts.put(expectedUTF8Tokens.get(i), count);
+            } else {
+                count++;
+            }
 
-		// hashed tokens ignoring token count
-		for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
-			int hash = tokenHash(expectedUTF8Tokens.get(i), 1);
-			expectedHashedUTF8Tokens.add(hash);
-		}
+            int hash = tokenHash(expectedUTF8Tokens.get(i), count);
+            expectedCountedHashedUTF8Tokens.add(hash);
+        }
+    }
 
-		// hashed tokens using token count
-		HashMap<String, Integer> tokenCounts = new HashMap<String, Integer>();
-		for (int i = 0; i < expectedUTF8Tokens.size(); i++) {
-			Integer count = tokenCounts.get(expectedUTF8Tokens.get(i));
-			if (count == null) {
-				count = 1;
-				tokenCounts.put(expectedUTF8Tokens.get(i), count);
-			} else {
-				count++;
-			}
+    @Test
+    public void testWordTokenizerWithCountedHashedUTF8Tokens() throws IOException {
 
-			int hash = tokenHash(expectedUTF8Tokens.get(i), count);
-			expectedCountedHashedUTF8Tokens.add(hash);
-		}
-	}
+        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(false, false,
+                tokenFactory);
 
-	@Test
-	public void testWordTokenizerWithCountedHashedUTF8Tokens()
-			throws IOException {
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
 
-		HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
-		DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(
-				false, false, tokenFactory);
+        int tokenCount = 0;
 
-		tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
 
-		int tokenCount = 0;
+            // serialize token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
 
-		while (tokenizer.hasNext()) {
-			tokenizer.next();
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
 
-			// serialize token
-			ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
-			DataOutput tokenDos = new DataOutputStream(tokenBaos);
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
 
-			IToken token = tokenizer.getToken();
-			token.serializeToken(tokenDos);
+            Integer hashedToken = in.readInt();
 
-			// deserialize token
-			ByteArrayInputStream bais = new ByteArrayInputStream(
-					tokenBaos.toByteArray());
-			DataInput in = new DataInputStream(bais);
+            Assert.assertEquals(hashedToken, expectedCountedHashedUTF8Tokens.get(tokenCount));
 
-			Integer hashedToken = in.readInt();
+            tokenCount++;
+        }
+    }
 
-			// System.out.println(hashedToken);
+    @Test
+    public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
 
-			Assert.assertEquals(hashedToken,
-					expectedCountedHashedUTF8Tokens.get(tokenCount));
+        HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
 
-			tokenCount++;
-		}
-	}
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
 
-	@Test
-	public void testWordTokenizerWithHashedUTF8Tokens() throws IOException {
+        int tokenCount = 0;
 
-		HashedUTF8WordTokenFactory tokenFactory = new HashedUTF8WordTokenFactory();
-		DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(
-				true, false, tokenFactory);
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
 
-		tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+            // serialize token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
 
-		int tokenCount = 0;
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
 
-		while (tokenizer.hasNext()) {
-			tokenizer.next();
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
 
-			// serialize token
-			ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
-			DataOutput tokenDos = new DataOutputStream(tokenBaos);
+            Integer hashedToken = in.readInt();
 
-			IToken token = tokenizer.getToken();
-			token.serializeToken(tokenDos);
+            Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount), hashedToken);
 
-			// deserialize token
-			ByteArrayInputStream bais = new ByteArrayInputStream(
-					tokenBaos.toByteArray());
-			DataInput in = new DataInputStream(bais);
+            tokenCount++;
+        }
+    }
 
-			Integer hashedToken = in.readInt();
+    @Test
+    public void testWordTokenizerWithUTF8Tokens() throws IOException {
 
-			// System.out.println(hashedToken);
+        UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
+        DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(true, false, tokenFactory);
 
-			Assert.assertEquals(expectedHashedUTF8Tokens.get(tokenCount),
-					hashedToken);
+        tokenizer.reset(inputBuffer, 0, inputBuffer.length);
 
-			tokenCount++;
-		}
-	}
+        int tokenCount = 0;
 
-	@Test
-	public void testWordTokenizerWithUTF8Tokens() throws IOException {
+        while (tokenizer.hasNext()) {
+            tokenizer.next();
 
-		UTF8WordTokenFactory tokenFactory = new UTF8WordTokenFactory();
-		DelimitedUTF8StringBinaryTokenizer tokenizer = new DelimitedUTF8StringBinaryTokenizer(
-				true, false, tokenFactory);
+            // serialize hashed token
+            ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
+            DataOutput tokenDos = new DataOutputStream(tokenBaos);
 
-		tokenizer.reset(inputBuffer, 0, inputBuffer.length);
+            IToken token = tokenizer.getToken();
+            token.serializeToken(tokenDos);
 
-		int tokenCount = 0;
+            // deserialize token
+            ByteArrayInputStream bais = new ByteArrayInputStream(tokenBaos.toByteArray());
+            DataInput in = new DataInputStream(bais);
 
-		while (tokenizer.hasNext()) {
-			tokenizer.next();
+            String strToken = in.readUTF();
 
-			// serialize hashed token
-			ByteArrayOutputStream tokenBaos = new ByteArrayOutputStream();
-			DataOutput tokenDos = new DataOutputStream(tokenBaos);
+            Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
 
-			IToken token = tokenizer.getToken();
-			token.serializeToken(tokenDos);
+            tokenCount++;
+        }
+    }
 
-			// deserialize token
-			ByteArrayInputStream bais = new ByteArrayInputStream(
-					tokenBaos.toByteArray());
-			DataInput in = new DataInputStream(bais);
-
-			String strToken = in.readUTF();
-
-			// System.out.println(strToken);
-
-			Assert.assertEquals(expectedUTF8Tokens.get(tokenCount), strToken);
-
-			tokenCount++;
-		}
-	}
-
-	// JAQL
-	public int tokenHash(String token, int tokenCount) {
-		int h = AbstractUTF8Token.GOLDEN_RATIO_32;
-		for (int i = 0; i < token.length(); i++) {
-			h ^= token.charAt(i);
-			h *= AbstractUTF8Token.GOLDEN_RATIO_32;
-		}
-		return h + tokenCount;
-	}
+    // JAQL Hash
+    public int tokenHash(String token, int tokenCount) {
+        int h = AbstractUTF8Token.GOLDEN_RATIO_32;
+        for (int i = 0; i < token.length(); i++) {
+            h ^= token.charAt(i);
+            h *= AbstractUTF8Token.GOLDEN_RATIO_32;
+        }
+        return h + tokenCount;
+    }
 }