Functional Scope (In-Scope)
- Trie Prefix Autocomplete: Build efficient Trie prefix indexes returning top-5 matching keywords.
- Inverted Posting Index: Map individual word terms to document IDs and occurrence frequencies.
- Dynamic Ranking Strategies (Strategy Pattern): Support switching algorithms (TF-IDF vs Simple term occurrence counts) dynamically.
- AND Intersection Queries: Perform sorted posting lists intersections in O(N+M) complexity.
Explicit Boundaries (Out-of-Scope)
- No Intelligent NLP Synonym Parsing: Excludes lemmatization, semantic word embedding checks, or multilingual parsers.
- No Real crawling System: Bypasses web page scrapers, HTML parsers, or outbound HTTP fetching pipelines.
Robust reference designs demonstrating Trie prefixes and TF-IDF posting in Java and Python:
// ─── JAVA BLUEPRINT ──────────────────────────────────────────────────────────
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.locks.*;
class TrieNode {
private final Map<Character, TrieNode> children = new ConcurrentHashMap<>();
private final List<String> suggestions = new CopyOnWriteArrayList<>();
private final ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock();
public Map<Character, TrieNode> getChildren() { return children; }
public List<String> getSuggestions() {
rwLock.readLock().lock();
try {
return new ArrayList<>(suggestions);
} finally {
rwLock.readLock().unlock();
}
}
public void addSuggestion(String word) {
rwLock.writeLock().lock();
try {
if (!suggestions.contains(word)) {
suggestions.add(word);
// Sort suggestions by length to simulate relevance
suggestions.sort(Comparator.comparingInt(String::length));
if (suggestions.size() > 5) {
suggestions.remove(suggestions.size() - 1);
}
}
} finally {
rwLock.writeLock().unlock();
}
}
}
class Trie {
private final TrieNode root = new TrieNode();
public void insert(String word) {
TrieNode curr = root;
for (char c : word.toLowerCase().toCharArray()) {
curr = curr.getChildren().computeIfAbsent(c, k -> new TrieNode());
curr.addSuggestion(word);
}
}
public List<String> getSuggestions(String prefix) {
TrieNode curr = root;
for (char c : prefix.toLowerCase().toCharArray()) {
curr = curr.getChildren().get(c);
if (curr == null) return Collections.emptyList();
}
return curr.getSuggestions();
}
}
class Posting {
private final String docId;
private final int termFrequency;
public Posting(String docId, int termFrequency) {
this.docId = docId;
this.termFrequency = termFrequency;
}
public String getDocId() { return docId; }
public int getTermFrequency() { return termFrequency; }
}
class SearchResult {
private final String docId;
private final double score;
public SearchResult(String docId, double score) {
this.docId = docId;
this.score = score;
}
public String getDocId() { return docId; }
public double getScore() { return score; }
}
// ─── STRATEGY PATTERN (RANKING) ──────────────────────────────────────────────
interface RankingStrategy {
List<SearchResult> rank(List<String> terms, Map<String, List<Posting>> index, int totalDocuments);
}
class TFIDFRankingStrategy implements RankingStrategy {
@Override
public List<SearchResult> rank(List<String> terms, Map<String, List<Posting>> index, int totalDocs) {
Map<String, Double> scores = new HashMap<>();
for (String term : terms) {
List<Posting> postings = index.getOrDefault(term.toLowerCase(), Collections.emptyList());
if (postings.isEmpty()) continue;
// IDF = ln(1 + totalDocs / documentFrequency)
double idf = Math.log(1.0 + (double) totalDocs / postings.size());
for (Posting posting : postings) {
// TF-IDF = tf * idf
double tfIdf = posting.getTermFrequency() * idf;
scores.put(posting.getDocId(), scores.getOrDefault(posting.getDocId(), 0.0) + tfIdf);
}
}
List<SearchResult> results = new ArrayList<>();
for (Map.Entry<String, Double> entry : scores.entrySet()) {
results.add(new SearchResult(entry.getKey(), entry.getValue()));
}
results.sort((a, b) -> Double.compare(b.getScore(), a.getScore()));
return results;
}
}
class SimpleFrequencyRankingStrategy implements RankingStrategy {
@Override
public List<SearchResult> rank(List<String> terms, Map<String, List<Posting>> index, int totalDocs) {
Map<String, Integer> scores = new HashMap<>();
for (String term : terms) {
List<Posting> postings = index.getOrDefault(term.toLowerCase(), Collections.emptyList());
for (Posting p : postings) {
scores.put(p.getDocId(), scores.getOrDefault(p.getDocId(), 0) + p.getTermFrequency());
}
}
List<SearchResult> results = new ArrayList<>();
for (Map.Entry<String, Integer> entry : scores.entrySet()) {
results.add(new SearchResult(entry.getKey(), entry.getValue()));
}
results.sort((a, b) -> Double.compare(b.getScore(), a.getScore()));
return results;
}
}
class SearchEngine {
private final Trie autocompleteTrie = new Trie();
private final Map<String, List<Posting>> invertedIndex = new ConcurrentHashMap<>();
private final Set<String> registeredDocIds = ConcurrentHashMap.newKeySet();
private RankingStrategy rankingStrategy;
public SearchEngine(RankingStrategy rankingStrategy) {
this.rankingStrategy = rankingStrategy;
}
public void setRankingStrategy(RankingStrategy rankingStrategy) {
this.rankingStrategy = rankingStrategy;
}
public void indexDocument(String docId, String content) {
registeredDocIds.add(docId);
String[] words = content.toLowerCase().split("\\W+");
Map<String, Integer> freqs = new HashMap<>();
for (String word : words) {
if (word.trim().isEmpty()) continue;
freqs.put(word, freqs.getOrDefault(word, 0) + 1);
autocompleteTrie.insert(word);
}
for (Map.Entry<String, Integer> entry : freqs.entrySet()) {
String word = entry.getKey();
int freq = entry.getValue();
invertedIndex.compute(word, (k, postings) -> {
if (postings == null) postings = new CopyOnWriteArrayList<>();
postings.add(new Posting(docId, freq));
return postings;
});
}
}
public List<String> autocomplete(String prefix) {
return autocompleteTrie.getSuggestions(prefix);
}
public List<SearchResult> search(String query) {
// Simple space-separated AND query resolver
String[] terms = query.toLowerCase().split("\\s+");
List<String> queryTerms = Arrays.asList(terms);
return rankingStrategy.rank(queryTerms, invertedIndex, registeredDocIds.size());
}
}
public class Main {
public static void main(String[] args) {
System.out.println("=== Search Engine with Trie ===");
SearchEngine engine = new SearchEngine(new TFIDFRankingStrategy());
engine.indexDocument("DOC-1", "Concurrency in Java is highly complex and beautiful.");
engine.indexDocument("DOC-2", "Java offers thread-safe collections and safe locks.");
engine.indexDocument("DOC-3", "Trie autocomplete supports real-time fast suggestions.");
// Autocomplete
System.out.println("Autocomplete for 'con': " + engine.autocomplete("con"));
System.out.println("Autocomplete for 'ja': " + engine.autocomplete("ja"));
// Search with TF-IDF Strategy
System.out.println("--- Search Results for 'Java' (TF-IDF) ---");
for (SearchResult res : engine.search("Java")) {
System.out.println(String.format("Doc ID: %s, Score: %.4f", res.getDocId(), res.getScore()));
}
// Change Strategy to simple frequency
engine.setRankingStrategy(new SimpleFrequencyRankingStrategy());
System.out.println("--- Search Results for 'Java' (Simple Frequency) ---");
for (SearchResult res : engine.search("Java")) {
System.out.println(String.format("Doc ID: %s, Score: %.4f", res.getDocId(), res.getScore()));
}
}
}