-
Notifications
You must be signed in to change notification settings - Fork 0
/
WordTree.cs
189 lines (159 loc) · 5.7 KB
/
WordTree.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Grammophone.Indexing
{
/// <summary>
/// A radix tree containing words of
/// character type <typeparamref name="C"/>.
/// </summary>
/// <typeparam name="C">The type of the character.</typeparam>
/// <typeparam name="D">The type of information allocated per word.</typeparam>
/// <typeparam name="N">The type of extra data stored per node.</typeparam>
[Serializable]
public class WordTree<C, D, N> : RadixTree<C, D, N>
{
#region Construction
/// <summary>
/// Create.
/// </summary>
/// <param name="wordItemProcessor">The optional word item processor to use during word addition.</param>
public WordTree(WordItemProcessor<C, D, N> wordItemProcessor = null)
: base(wordItemProcessor)
{
}
#endregion
#region Public methods
/// <summary>
/// Add a word to the tree.
/// </summary>
/// <param name="word">The word to add.</param>
/// <param name="wordItem">The info associated with the word.</param>
/// <remarks>
/// In order for the tree to be non-implicit, ie. to have a leaf for each word, the word
/// must have as its last 'character' a character that doesn't belong to the
/// 'character set' or 'alphabet'. This is frequently called a sentinel, or a termination
/// special character, and is usually depicted in various papers as '$'.
/// Time and space complexity: O(<paramref name="word"/>.Length).
/// </remarks>
public override void AddWord(C[] word, D wordItem)
{
// Search for the longest available common match
// betwen the word and the tree contents.
var searchResult = this.LongestCommonPrefixSearch(word);
// Did we find any significant match?
// Equivalently, is the found branch other than the root?
var branch = searchResult.Branch;
if (branch != this.Root)
{
// Yes, at least a common prefix was found.
// Was the whole word found?
if (searchResult.Match.Length == word.Length)
{
// Yes, the whole word was matched.
// Does the match terminate at branch boundary?
if (searchResult.MatchEndOffset != branch.Length)
{
// No, so there is a need to create an explicit node by
// breaking the branch in two.
branch = searchResult.Branch.Split(searchResult.MatchEndOffset);
}
// Add the information that corresponds to the given word.
wordItemProcessor.OnWordAdd(word, wordItem, branch);
}
else
{
// No, just part of the word was matched against the existing
// contents of the tree.
// Does the match terminate at branch boundary?
if (searchResult.MatchEndOffset != branch.Length)
{
// No, so there is a need to create an explicit node
// breaking the branch in two.
branch = searchResult.Branch.Split(searchResult.MatchEndOffset);
}
AddBranch(word, wordItem, branch, branch.StartIndex + searchResult.MatchEndOffset);
}
}
else
{
// No, there is no match with existing content. Append a branch at root.
AddBranch(word, wordItem, this.Root, 0);
}
/* INLINE IMPLEMENTATION */
//// The current branch where we try to match the current character.
//var branch = this.Root;
//// The offset in the current branch where we
//// compare our current character.
//int branchOffset = 0;
//for (int i = 0; i < word.Length; i++)
//{
// // Our current character.
// C chr = word[i];
// // Did we reach the end of the branch?
// if (branchOffset >= branch.Length)
// {
// // Yes, so try to find whether there is an outgoing branch
// // starting with the current character.
// var nextBranch = branch.GetChildStartingWith(chr);
// // Is there such a branch?
// if (nextBranch == null)
// {
// // No, append a branch for the rest of the word and return.
// AddBranch(word, wordItem, branch, i);
// return;
// }
// // Yes, a branch starting with the current character is found,
// // so, continue from there.
// // Take account of the found character by setting branchOffset
// // equal to one.
// branch = nextBranch;
// branchOffset = 1;
// }
// else
// {
// // No, our position is somewhere in the middle of the current branch.
// // Does our character match with the one at the position of the branch?
// if (!branch.Source[branch.StartIndex + branchOffset].Equals(chr))
// {
// // If not, split the branch and
// // append a new branch for the rest of the word at the split position
// // and return.
// branch.Split(branchOffset);
// AddBranch(word, wordItem, branch, i);
// return;
// }
// else
// {
// // If yes, advance our search position
// // in the current branch for the next character test.
// branchOffset++;
// }
// }
//}
//// If we reached this point, there was a total path overlap
//// of the word against the preexisting tree.
//// If the overlap ended in the middle of a branch...
//if (branchOffset < branch.Length)
//{
// // ...split the branch where the new word ended, so we can
// // add later the word item at the split point.
// branch.Split(branchOffset);
//}
//// Add the word item at the branch end, which is by now
//// the word path termination point.
//branch.AddWordItem(wordItem);
}
#endregion
#region Private methods
private Branch AddBranch(C[] word, D wordItem, Branch branch, int wordOffset)
{
var nextBranch = new Branch(word, wordOffset, 0);
branch.AddChild(nextBranch);
wordItemProcessor.OnWordAdd(word, wordItem, nextBranch);
return nextBranch;
}
#endregion
}
}