admin管理员组文章数量:1277315
How would you approach finding duplicates in a text document. Duplicates can be a set of consecutive words or sentence. Sentence not necessary ending with a dot. let's say page contains a document of 200 lines of which 2 sentences are identical, we want to highlight those 2 sentences as duplicates when "check duplicate button" is clicked.
How would you approach finding duplicates in a text document. Duplicates can be a set of consecutive words or sentence. Sentence not necessary ending with a dot. let's say page contains a document of 200 lines of which 2 sentences are identical, we want to highlight those 2 sentences as duplicates when "check duplicate button" is clicked.
Share Improve this question edited Feb 19, 2011 at 1:39 Hussein asked Feb 18, 2011 at 20:21 HusseinHussein 42.8k25 gold badges115 silver badges143 bronze badges 1-
Let's say.
<body><p>....text.......</p></body>
– Hussein Commented Feb 18, 2011 at 20:56
3 Answers
Reset to default 5Interesting question — here's an idea on how I'd do that probably: http://jsfiddle/SaQAs/1/ — Not anyhow optimized!
var text = $('p').text(),
words = text.split(' '),
sortedWords = words.slice(0).sort(),
duplicateWords = [],
sentences = text.split('.'),
sortedSentences = sentences.slice(0).sort(),
duplicateSentences = [];
for (var i=0; i<sortedWords.length-1; i++) {
if (sortedWords[i+1] == sortedWords[i]) {
duplicateWords.push(sortedWords[i]);
}
}
duplicateWords = $.unique(duplicateWords);
for (var i=0; i<sortedSentences.length-1; i++) {
if (sortedSentences[i+1] == sortedSentences[i]) {
duplicateSentences.push(sortedSentences[i]);
}
}
duplicateSentences = $.unique(duplicateSentences);
$('a.words').click(function(){
var highlighted = $.map(words, function(word){
if ($.inArray(word, duplicateWords) > -1)
return '<span class="duplicate">' + word + '</span>';
else return word;
});
$('p').html(highlighted.join(' '));
return false;
});
$('a.sentences').click(function(){
var highlighted = $.map(sentences, function(sentence){
if ($.inArray(sentence, duplicateSentences) > -1)
return '<span class="duplicate">' + sentence + '</span>';
else return sentence;
});
$('p').html(highlighted.join('.'));
return false;
});
Update 1
This one finds sequences of identical words: http://jsfiddle/YQdk5/1/ From here it shouldn't be hard to e.g. ignore any kind of punctuation at the end of fragments when paring — you'd would just have to write your own version of an inArray
method.
var text = $('p').text(),
words = text.split(' '),
sortedWords = words.slice(0).sort(),
duplicateWords = []
highlighted = [];
for (var i=0; i<sortedWords.length-1; i++) {
if (sortedWords[i+1] == sortedWords[i]) {
duplicateWords.push(sortedWords[i]);
}
}
duplicateWords = $.unique(duplicateWords);
for (var j=0, m=[]; j<words.length; j++) {
m.push($.inArray(words[j], duplicateWords) > -1);
if (!m[j] && m[j-1])
highlighted.push('</span>');
else if (m[j] && !m[j-1])
highlighted.push('<span class="duplicate">');
highlighted.push(words[j]);
}
$('p').html(highlighted.join(' '));
Update 2
My regex-fu is weak, but this (pretty messy!) version seems to work okay: http://jsfiddle/YQdk5/2/ — I'm pretty sure that there might be a better way of doing this, but for now I've got to leave it alone! :D — Good luck!
Update 3
Thinking about it, I don't think that the code from the previous update is any good. That's why I've removed it. You can still find it here: http://jsfiddle/YQdk5/2/ The main point is to use a regex to match words, something along the lines of:
/^word(\.?)$/
Here is the solution that uses suffix tree:
function SuffixTree(text) {
var regex = /\b\w+/g;
var words = text.match(regex);
var wave = [];
var words_l = words.length;
if (words_l == 0) return false;
this.tree = this.node("", false);
for (var i = 0; i < words_l; ++i) {
var x = words[i] + "_";
wave.push(this.tree);
var wave_l = wave.length;
for (var j = 0; j < wave_l; ++j) {
var y = wave[j];
if (typeof y[x] != 'undefined') y[x].count++;
else y[x] = this.node(words[i], y);
wave[j] = y[x];
}
}
}
SuffixTree.prototype = {
dummy: {count: 1},
node: function(word, num, parent) {
return {
count: 1,
word: word,
parent: parent
};
},
duplicates: function(h) {
this.dups = [];
this.bypass(this.tree, h, 0);
var l = this.dups.length;
this.dups.sort(function(d1, d2) { return d1.depth > d2.depth ? 1 : -1; });
for (var i = 0; i < l; ++i) {
var d = this.dups[i];
this.dups[i] = { s: " " + this.sentence(d.a) + " ", depth: d.depth, count: d.a.count };
}
for (var i = 0; i < l; ++i) {
var d = this.dups[i];
console.log(i, d.s);
}
for (var i = 0; i < l; ++i) {
var d = this.dups[i];
var fl = true;
for (var j = i + 1; j < l; ++j) {
if (this.dups[j].s.indexOf(d.s) != -1) fl = false;
}
if (fl) h(d.s.substr(1, d.s.length - 2), d.count);
}
},
bypass: function(a, h, depth) {
if (a.constructor != Object) return;
var fl = true;
for (var i in a) {
if (i == 'parent') continue;
var b = a[i];
if (b.count == a.count) fl = false;
this.bypass(b, h, depth + 1);
}
if (fl && a.count > 1) {
this.dups.push({ a: a, depth: depth });
}
},
sentence: function(a) {
var s = a.word;
while (a = a.parent) {
s = a.word + " " + s;
}
return s;
}
};
var text = "This is a text with some duplicates: words, sentences of different length. For example here is a duplicate word. This sentence has some duplicates. But not all of us can find clones.";
var T = new SuffixTree(text);
var h = function(s, c) {
document.write(s + "[" + c + "]<br/>");
};
T.duplicates(h);
1) Split the input text to the array of words. 2) Build the suffix tree. 3) Find the longest suffixes of the tree. 4) Remove sentences that are contained in others (i.e. remove "is" that is a part of "this is a").
You can change the regular expression to take into account html tags.
I hope this helps you.
P.S. h
is the callback for found duplicates.
You're javascript contains references to a javascript library named jQuery.
You are not including this in your HTML, and thus it will fail. You can include it via jquery cdn
And todays tip: Use the developer tools in your browser. In the console you can see what parts of javascript that fails.
本文标签: JavascriptjQuery Find Text duplicatesStack Overflow
版权声明:本文标题:JavascriptjQuery Find Text duplicates - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1741274244a2369635.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论