Line data Source code
1 : #include "WordDiffCache.h"
2 : #include "WordDiffSegmenter.h"
3 :
4 : namespace wikidiff2 {
5 :
6 : WordDiffCache::String WordDiffCache::newlineStorage = "\n";
7 : Word WordDiffCache::newline(newlineStorage.begin(),
8 : newlineStorage.begin() + 1, newlineStorage.begin() + 1);
9 :
10 170 : WordDiffCache::WordDiffPtr WordDiffCache::getDiff(const String * from, const String * to)
11 : {
12 170 : return getConcatDiff(from, 1, to, 1);
13 : }
14 :
15 768 : WordDiffCache::WordDiffPtr WordDiffCache::getConcatDiff(
16 : const String * fromStart, size_t fromSize,
17 : const String * toStart, size_t toSize)
18 : {
19 : DiffCacheKey key(
20 : getKey(fromStart), fromSize,
21 768 : getKey(toStart), toSize);
22 768 : DiffCache::iterator it = diffCache.find(key);
23 768 : if (it == diffCache.end()) {
24 651 : const WordVector & words1 = getConcatWords(fromStart, fromSize);
25 651 : const WordVector & words2 = getConcatWords(toStart, toSize);
26 0 : WordDiffPtr wordDiffPtr = std::allocate_shared<WordDiff>(WD2_ALLOCATOR<WordDiff>(),
27 651 : diffConfig, words1, words2);
28 :
29 651 : if (fromSize > 1 || toSize > 1) {
30 0 : WordDiffSegmenter::segment(*wordDiffPtr);
31 : }
32 :
33 651 : auto result = diffCache.insert(std::make_pair(key, wordDiffPtr));
34 651 : it = result.first;
35 : } else {
36 117 : hitStats.diffHits++;
37 : }
38 768 : hitStats.diffTotal++;
39 1536 : return it->second;
40 : }
41 :
42 : /**
43 : * Get an integer key from a String address.
44 : *
45 : * Maybe a weird concept, but not much worse than what we were doing in Diff
46 : * already. A hashtable would be slow since String doesn't memoize hashes.
47 : * We don't accept any pointer, just one that is inside a previously
48 : * registered vector.
49 : *
50 : * Theoretically we could track line numbers right through DiffEngine/Diff
51 : * instead of reconstructing the line numbers from the pointers.
52 : */
53 4656 : size_t WordDiffCache::getKey(const String * str)
54 : {
55 4656 : size_t r = 0;
56 6984 : for (size_t i = 0; i < 2; i++) {
57 6984 : const StringVector & lines = *(linesVecPtrs[i]);
58 6984 : size_t n = lines.size();
59 6984 : if (n && str >= &lines[0] && str <= &lines[n - 1]) {
60 4656 : return r + str - &lines[0];
61 : }
62 2328 : r += n;
63 : }
64 0 : throw std::invalid_argument("WordDiffCache::getKey: unregistered string pointer");
65 : }
66 :
67 801 : const WordDiffStats & WordDiffCache::getDiffStats(const String * from, const String * to)
68 : {
69 801 : return getConcatDiffStats(from, 1, to, 1);
70 : }
71 :
72 909 : const WordDiffStats & WordDiffCache::getConcatDiffStats(
73 : const String * fromStart, size_t fromSize,
74 : const String * toStart, size_t toSize)
75 : {
76 : DiffCacheKey key(
77 : getKey(fromStart), fromSize,
78 909 : getKey(toStart), toSize);
79 909 : StatsCache::iterator it = statsCache.find(key);
80 909 : if (it == statsCache.end()) {
81 598 : WordDiffPtr diff = getConcatDiff(fromStart, fromSize, toStart, toSize);
82 598 : auto result = statsCache.insert(std::make_pair(key, WordDiffStats(*diff)));
83 598 : it = result.first;
84 : } else {
85 311 : hitStats.statHits++;
86 : }
87 909 : hitStats.statTotal++;
88 909 : return it->second;
89 : }
90 :
91 94 : void WordDiffCache::setLines(const StringVector * lines0, const StringVector * lines1)
92 : {
93 94 : linesVecPtrs[0] = lines0;
94 94 : linesVecPtrs[1] = lines1;
95 94 : wordsCache.clear();
96 94 : diffCache.clear();
97 94 : statsCache.clear();
98 94 : }
99 :
100 : /**
101 : * Get a vector of words for a line.
102 : */
103 1302 : const WordDiffCache::WordVector & WordDiffCache::explodeWords(const String * line)
104 : {
105 1302 : WordsCacheKey key(getKey(line), 1);
106 1302 : auto it = wordsCache.find(key);
107 1302 : hitStats.wordTotal++;
108 1302 : if (it != wordsCache.end()) {
109 933 : hitStats.wordHits++;
110 933 : return it->second;
111 : }
112 369 : textUtil.explodeWords(*line, tempWords);
113 369 : auto result = wordsCache.insert(WordsCache::value_type(key, WordVector()));
114 369 : result.first->second.swap(tempWords);
115 369 : return result.first->second;
116 : }
117 :
118 : /**
119 : * Get a word vector for one or more lines. If more than one line is requested,
120 : * the lines will be concatenated with a newline separator.
121 : *
122 : * TODO: Unfortunately the concatenated word vector must stay in memory as long
123 : * as the associated diffs stay in memory, because the diffs contain pointers
124 : * to the words. An improvement would be to have DiffEngine store integer
125 : * offsets instead of pointers. This would allow concatenated word vectors to
126 : * be temporary, reducing memory usage.
127 : */
128 1302 : const WordDiffCache::WordVector & WordDiffCache::getConcatWords(
129 : const String * lines, size_t numLines)
130 : {
131 1302 : if (numLines == 1) {
132 1302 : return explodeWords(lines);
133 : }
134 :
135 0 : WordsCacheKey key(getKey(lines), numLines);
136 0 : auto it = wordsCache.find(key);
137 0 : hitStats.concatWordTotal++;
138 0 : if (it != wordsCache.end()) {
139 0 : hitStats.concatWordHits++;
140 0 : return it->second;
141 : }
142 :
143 0 : WordVector concatWords;
144 0 : size_t numWords = 0;
145 0 : for (size_t i = 0; i < numLines; i++) {
146 0 : numWords += explodeWords(lines + i).size() + 1;
147 : }
148 0 : concatWords.reserve(numWords);
149 :
150 0 : for (size_t i = 0; i < numLines; i++) {
151 0 : const WordVector & words = explodeWords(lines + i);
152 0 : if (i > 0) {
153 0 : concatWords.push_back(newline);
154 : }
155 0 : for (auto it = words.begin(); it != words.end(); it++) {
156 0 : concatWords.push_back(*it);
157 : }
158 : }
159 0 : auto result = wordsCache.insert(std::make_pair(key, WordVector()));
160 0 : result.first->second.swap(concatWords);
161 0 : return result.first->second;
162 : }
163 :
164 0 : void WordDiffCache::dumpDebugReport()
165 : {
166 0 : auto h = hitStats;
167 : using std::endl;
168 0 : std::cerr << "Diff cache: " << h.diffHits << " / " << h.diffTotal << endl
169 0 : << "Stat cache " << h.statHits << " / " << h.statTotal << endl
170 0 : << "Word cache " << h.wordHits << " / " << h.wordTotal << endl
171 0 : << "Concatenated line word cache " << h.concatWordHits << " / " << h.concatWordTotal << endl;
172 0 : }
173 :
174 0 : void WordDiffCache::throwOutOfRange()
175 : {
176 0 : throw std::out_of_range("Numeric value out of range");
177 : }
178 :
179 16841 : bool WordDiffCache::DiffCacheKey::operator<(const DiffCacheKey & other) const
180 : {
181 16841 : if (from < other.from) return true;
182 11003 : if (from > other.from) return false;
183 6222 : if (fromSize < other.fromSize) return true;
184 6222 : if (fromSize > other.fromSize) return false;
185 6222 : if (to < other.to) return true;
186 3325 : if (to > other.to) return false;
187 856 : if (toSize < other.toSize) return true;
188 856 : if (toSize > other.toSize) return false;
189 856 : return false;
190 : }
191 :
192 7648 : bool WordDiffCache::WordsCacheKey::operator<(const WordsCacheKey & other) const
193 : {
194 7648 : if (line < other.line) return true;
195 4022 : if (line > other.line) return false;
196 1866 : if (size < other.size) return true;
197 1866 : if (size > other.size) return false;
198 1866 : return false;
199 : }
200 :
201 : } // namespace wikidiff2
|