Archive for category Data Structures

Putting Autocomplete Data Structures to the Test

I ran a simple test to compare the Radix Tree implementation I found on the web with the Trie implementation that I coded from my last post along with a binary search. I loaded all of these structures with over 50,000 random strings, and then performed 5000 prefix searches on each. Below are the results:

5000 prefix searches took the following amount of time…
Binary Search: 6 ms (1.2 microseconds per search)
Trie Search: 38 ms (7.6 microseconds per search)
Radix Trie Search: 59 ms (11.8 microseconds per search)

Overall, they’re all pretty decent solutions. But as far as speed goes, it looks like the binary search algorithm is the winner here, with my humble Trie structure coming in second, and the Radix Trie implementation coming in last. This just goes to show you that sometimes the simplest implementations are the best.

The code that I ran my test with is shown below.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;

import ds.tree.RadixTree;  
import ds.tree.RadixTreeImpl;

public class Main {
	Set<String> valueSet = new TreeSet<String>();
	Trie<String> trie = new TrieImpl<String>();
	RadixTree<String> radixTree = new RadixTreeImpl<String>();
	
	/**
	 * Load up the trie with lots of random strings
	 */
	public void loadTrie() {
		// Load the Trie and a Set
		for (int i = 0; i < 51000; i++) {
			String val = getRandomString();
			valueSet.add(val);
			trie.add(val, val);
		}
		
		// Load the RadixTree
		String[] strArry = new String[valueSet.size()];
		valueSet.toArray(strArry);
		for (int i = 0; i < strArry.length; i++) {
			radixTree.insert(strArry[i], strArry[i]);
		}
	}
	
	/**
	 * Test a prefix search
	 */
	public void testSearch() {
		String[] strArry = new String[valueSet.size()];
		valueSet.toArray(strArry);
		
		// Test Trie Access
		long start = System.currentTimeMillis();
		for(int i = 0; i < 5000; i++) {
			trie.search(strArry[i].substring(0, 3));
		}
		long end = System.currentTimeMillis();
		long duration = end - start;
		System.out.println("Trie Access = " + duration);
		
		// Test RadixTree Access
		start = System.currentTimeMillis();
		for(int i = 0; i < 5000; i++) {
			radixTree.searchPrefix(strArry[i].substring(0, 3), 10000);
		}
		end = System.currentTimeMillis();
		duration = end - start;
		System.out.println("RadixTrie Access = " + duration);
		
		// Test Binary Search Access
		start = System.currentTimeMillis();
		for(int i = 0; i < 5000; i++) {
			List<String> list = new ArrayList<String>();
			String prefix = strArry[i].substring(0, 3);
			int pos = Arrays.binarySearch(strArry, prefix);
			while (pos >= 0 && pos < strArry.length) {
				if (strArry[pos].startsWith(prefix)) {
					list.add(strArry[pos]);
				} else {
					break;
				}
				pos++;
			}
		}
		end = System.currentTimeMillis();
		duration = end - start;
		System.out.println("Binary Search Access = " + duration);
	}
	
	/**
	 * Returns a random string with length between
	 * 5 and 10.
	 */
	private String getRandomString() {
		StringBuilder sb = new StringBuilder();
		int len = getRandomNumber(5, 10);
		for (int i = 0; i < len; i++) {
			sb.append(getRandomChar());
		}
		
		return sb.toString();
	}
	
	/**
	 * Returns a random number between start and end
	 */
	private int getRandomNumber(int start, int end) {
		//get the range, casting to long to avoid overflow problems
		long range = (long)end - (long)start + 1;
		
		// compute a fraction of the range, 0 <= frac < range
		long fraction = (long)(range * new Random().nextDouble());
		return (int)(fraction + start); 
	}
	
	/**
	 * Returns a random letter of the alphabet.
	 */
	private char getRandomChar() {
		String alphabet="ABCDEFGHIJKLMNOPQRSTUVWXYZ";
		int pos = (int)(Math.random()*26);
		return alphabet.charAt(pos);
	}
	
    public static void main(String[] args) {
		Main main = new Main();
		main.loadTrie();
		main.testSearch();
    }
}

, ,

12 Comments

Trie Structures

As an interesting exercise, I decided to write my own Trie implementation. The code is shown below. I’ve tested it with a small amount of data, and it seems to work OK. The next step is to load it up with a large amount of data to see how it performs against the Radix Tree implementation and a binary search. I will save that for a later post.

A data structure like this would work great in conjunction with the YUI Autocomplete control.

The Trie Interface

import java.util.List;
import java.util.Set;

public interface Trie<T> {
	public void add(String key, T value);
	
	public T find(String key);
	
	public List<T> search(String prefix);
	
	public boolean contains(String key);
	
	public Set<String> getAllKeys();
	
	public int size();
}

The Trie Implementation
There’s a lot of recursive calls going on here…

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class TrieImpl<T> implements Trie<T> {
	private TrieNode<T> rootNode = new TrieNode<T>();
	
	public void add(String key, T value) {
		addNode(rootNode, key, 0, value);
	}

	public T find(String key) {
		return findKey(rootNode, key);
	}

	public List<T> search(String prefix) {
		List<T> list = new ArrayList<T>();
		
		char[] ch = prefix.toCharArray();
		TrieNode<T> node = rootNode;
		for (int i = 0; i < ch.length; i++) {
			node = node.getChildren().get(ch[i]);
			if (node == null) {
				break;
			}
		}
		
		if (node != null) {
			getValuesFromNode(node, list);
		}
		
		return list;
	}

	public boolean contains(String key) {
		return hasKey(rootNode, key);
	}
	
	public Set<String> getAllKeys() {
		Set<String> keySet = new HashSet<String>();
		getKeysFromNode(rootNode, "", keySet);
		
		return keySet;
	}
	
	public int size() {
		return getAllKeys().size();
	}
	
	private void getValuesFromNode(TrieNode<T> currNode, List<T> valueList) {
		if (currNode.isTerminal()) {
			valueList.add(currNode.getNodeValue());
		}
		
		Map<Character, TrieNode<T>> children = currNode.getChildren();
		Iterator childIter = children.keySet().iterator();
		while (childIter.hasNext()) {
			Character ch = (Character)childIter.next();
			TrieNode<T> nextNode = children.get(ch);
			getValuesFromNode(nextNode, valueList);
		}
	}
	
	private void getKeysFromNode(TrieNode<T> currNode, String key, Set keySet) {
		if (currNode.isTerminal()) {
			keySet.add(key);
		}
		
		Map<Character, TrieNode<T>> children = currNode.getChildren();
		Iterator childIter = children.keySet().iterator();
		while (childIter.hasNext()) {
			Character ch = (Character)childIter.next();
			TrieNode<T> nextNode = children.get(ch);
			String s = key + nextNode.getNodeKey();
			getKeysFromNode(nextNode, s, keySet);
		}
	}
	
	private T findKey(TrieNode<T> currNode, String key) {
		Character c = key.charAt(0);
		if (currNode.getChildren().containsKey(c)) {
			TrieNode<T> nextNode = currNode.getChildren().get(c);
			if (key.length() == 1) {
				if (nextNode.isTerminal()) {
					return nextNode.getNodeValue();
				}
			} else {
				return findKey(nextNode, key.substring(1));
			}
		}
		
		return null;
	}
	
	private boolean hasKey(TrieNode<T> currNode, String key) {
		Character c = key.charAt(0);
		if (currNode.getChildren().containsKey(c)) {
			TrieNode<T> nextNode = currNode.getChildren().get(c);
			if (key.length() == 1) {
				if (nextNode.isTerminal()) {
					return true;
				}
			} else {
				return hasKey(nextNode, key.substring(1));
			}
		}
		
		return false;
	}
	
	private void addNode(TrieNode<T> currNode, String key, int pos, T value) {
		Character c = key.charAt(pos);
		TrieNode<T> nextNode = currNode.getChildren().get(c);
		
		if (nextNode == null) {
			nextNode = new TrieNode<T>();
			nextNode.setNodeKey(c);
			if (pos < key.length() - 1) {
				addNode(nextNode, key, pos + 1, value);
			} else {
				nextNode.setNodeValue(value);
				nextNode.setTerminal(true);
			}
			currNode.getChildren().put(c, nextNode);
		} else {
			if (pos < key.length() - 1) {
				addNode(nextNode, key, pos + 1, value);
			} else {
				nextNode.setNodeValue(value);
				nextNode.setTerminal(true);
			}
		}
	}
}

The Trie Node

import java.util.HashMap;
import java.util.Map;

public class TrieNode<T> {
	private Character nodeKey;
	private T nodeValue;
	private boolean terminal;
	private Map<Character, TrieNode<T>> children = new HashMap<Character, TrieNode<T>>();
	
	public Character getNodeKey() {
		return nodeKey;
	}

	public void setNodeKey(Character nodeKey) {
		this.nodeKey = nodeKey;
	}

	public T getNodeValue() {
		return nodeValue;
	}

	public void setNodeValue(T nodeValue) {
		this.nodeValue = nodeValue;
	}
	
	public boolean isTerminal() {
		return terminal;
	}

	public void setTerminal(boolean terminal) {
		this.terminal = terminal;
	}

	public Map<Character, TrieNode<T>> getChildren() {
		return children;
	}

	public void setChildren(Map<Character, TrieNode<T>> children) {
		this.children = children;
	}
}

The Test

import junit.framework.TestCase;

public class TrieTest extends TestCase {
	Trie<Product> productTrie = new TrieImpl<Product>();
	
	public void setUp() throws Exception {
		productTrie.add("ham", new Product(1, "ham"));
		productTrie.add("hammer", new Product(2, "hammer"));
		productTrie.add("hammock", new Product(3, "hammock"));
		productTrie.add("ipod", new Product(4, "ipod"));
		productTrie.add("iphone", new Product(5, "iphone"));
	}
	
	public void testAdd() {
		assertEquals(5, productTrie.size());
	}
	
	public void testFind() {
		assertNotNull(productTrie.find("ham"));
	}
	
	public void testSearch() {
		assertEquals(3, productTrie.search("ha").size());
	}
	
	public void testContains() {
		assertEquals(true, productTrie.contains("ipod"));
	}
	
	public void testGetAllKeys() {
		assertEquals(5, productTrie.getAllKeys().size());
	}
	
	class Product {
		private int productId;
		private String productDesc;
		
		public Product(int productId, String productDesc) {
			this.productId = productId;
			this.productDesc = productDesc;
		}
		
		public int getProductId() {
			return productId;
		}
		public void setProductId(int productId) {
			this.productId = productId;
		}
		public String getProductDesc() {
			return productDesc;
		}
		public void setProductDesc(String productDesc) {
			this.productDesc = productDesc;
		}
		
	}
}

,

1 Comment

Autocomplete Data Structures

When impelementing autocomplete functionality on your website, for fast and efficient lookup of user-entered text, you have a few options. If the amount of data to be used in your autocomplete implementation is not too large, you can send it all at once to the front-end and cache it in the browser. But many times, this may not be possible (e.g. if you have thousands or even millions of rows of text that the user can search on, such as sites like Google or Amazon).

In cases like these, you will need to pull the data from the server. You can pull the data from the database each time, but performance would not be very good. A better solution would be to cache the data on the server in some kind of data structure that gives optimal lookup performance. So as the user is typing, the round-trip to the server via an AJAX call would bring back the data quickly and efficiently.
One such possible structure is the Trie or Prefix Tree. This structure is described in Wikipedia as:
In computer science, a trie, or prefix tree, is an ordered tree data structure that is used to store an associative array where the keys are usually strings. Unlike a binary search tree, no node in the tree stores the key associated with that node; instead, its position in the tree shows what key it is associated with. All the descendants of a node have a common prefix of the string associated with that node, and the root is associated with the empty string. Values are normally not associated with every node, only with leaves and some inner nodes that correspond to keys of interest.
A more specific implementation of the Trie structure is the Radix Tree, Patricia trie/tree, or crit bit tree. The Radix Tree is a data structure that provides for fast searching of prefixes. A nice implementation of this structure was created by Tahseen Ur Rehman and can be found here: http://code.google.com/p/radixtree/.

In cases like these, you will need to pull the data from the server. You can pull the data from the database each time, but performance for this would not be good. A better solution would be to cache the data on the server in some kind of data structure that gives optimal lookup performance. So as the user is typing, the round-trip to the server via an AJAX call would bring back the data quickly and efficiently.

One such possible structure is the Trie or Prefix Tree. This structure is described in Wikipedia as:

In computer science, a trie, or prefix tree, is an ordered tree data structure that is used to store an associative array where the keys are usually strings. Unlike a binary search tree, no node in the tree stores the key associated with that node; instead, its position in the tree shows what key it is associated with. All the descendants of a node have a common prefix of the string associated with that node, and the root is associated with the empty string. Values are normally not associated with every node, only with leaves and some inner nodes that correspond to keys of interest.

A more specific implementation of the Trie structure is the Radix Tree, Patricia trie/tree, or crit bit tree. The Radix Tree is a data structure that provides for fast and efficient searching of prefixes. A good implementation of this structure was created by Tahseen Ur Rehman and can be found here: http://code.google.com/p/radixtree/. The nice thing about this implementation is that you can store all of your search text as keys and any other type of object as the values associated with those keys.

For example, suppose you have an online store and want to allow the user to search the entire list of products that you sell. You can cache the list of product names along with some detailed product information into your Radix Tree on the server side. You might also want to load the product info on application startup and periodically refresh the content.

Below is an implementation of a Factory class that is used to retrieve our Radix tree. It is synchronized to handle refreshing the tree by a separate thread running in Spring.


import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import ds.tree.RadixTree;
import ds.tree.RadixTreeImpl;

/**
 * Factory class for creating the tree data structure 
 * used for autocomplete lookup of products.
 *
 */ 

 public class ProductTreeFactory {
    private static Log log = LogFactory.getLog(ProductTreeFactory.class);

    private RadixTree<Product> productTree = new RadixTreeImpl<Product>();
    private ProductLookupDao productLookupDao;

   /**
    * Sets the ProductLookupDao.
    *
    * @param productLookupDao The DAO
    */
   public void setProductLookupDao(ProductLookupDao productLookupDao) {
      this.productLookupDao = productLookupDao;
   }

   /**
    * Returns the tree object containing products.
    *
    * @return The product tree
    */
   public synchronized RadixTree<Product> getProductTree() {
      return productTree;
   }

   /**
    * Loads the tree with product data from the database.
    */
   public synchronized void loadTree() {
      log.info("Loading product data into tree");

      productTree = new RadixTreeImpl<Product>();
      List<Product> products = productLookupDao.findAllProducts();

      for (Product product : products) {
         productTree.insert(product.getProductName(), product);
      }
   }
}

The Spring config below uses the quartz scheduler to refresh the tree once an hour.

    <!-- Scheduled Jobs -->
    <bean id="reloadTreeJob"
                    class="org.springframework.scheduling.quartz.MethodInvokingJobDetailFactoryBean">
               <property name="targetObject" ref="productTreeFactory" />
               <property name="targetMethod" value="loadTree" />
    </bean>

    <bean id="reloadTreeTrigger" class="org.springframework.scheduling.quartz.SimpleTriggerBean">
       <!-- see the example of method invoking job above -->
       <property name="jobDetail" ref="reloadTreeJob" />
        <!-- repeat every hour -->
        <property name="repeatInterval" value="3600000" />
    </bean>

    <!-- Quartz Scheduler -->
    <bean class="org.springframework.scheduling.quartz.SchedulerFactoryBean">
           <property name="triggers">
                   <list>
                           <ref bean="reloadTreeTrigger" />
                   </list>
           </property>
    </bean>

    <!-- Utility Classes -->
    <bean id="productTreeFactory" class="myapp.ProductTreeFactory" init-method="loadTree">
            <property name="productLookupDao" ref="productLookupDao"/>
    </bean>

 

Some other options that I found for caching data on the server are using a simple TreeSet or an embedded database such as HSQLDB or the H2 database. However, these options are not as fast or efficient as the Trie structure.

Update: I ran some timings on different structures for autocomplete and found a surprising result.

8 Comments

Follow

Get every new post delivered to your Inbox.