admin管理员组文章数量:1350125
I would like to split natural text into word pairs, triplets, quadruplets and on!
I have figured out how to split into pairs so far. I assume I will need an additional loop to accommodate the word count.
Here is the code for pairs
// Sample sentence used to demonstrate word n-gram extraction.
const test = "I love you so much, but Joe said \"he doesn't\"!";

// Tokens are split on single spaces, so punctuation stays attached to its word.
const words = test.split(" ");

// Pairs (2-grams): every word joined with the word that follows it.
const two_words = [];
for (let i = 0; i < words.length - 1; i++) {
  two_words.push(`${words[i]} ${words[i + 1]}`);
}
console.log(two_words);

// General case: every run of 2..split_length consecutive words, shortest
// runs first. Each window is read from `words` (the source tokens) and
// rebuilt from scratch with slice/join, so nothing carries over from the
// previous window and the words stay separated by single spaces.
const split_length = 5;
const split_words = [];
for (let size = 2; size <= split_length; size++) {
  // `start + size <= words.length` keeps the window inside the array.
  for (let start = 0; start + size <= words.length; start++) {
    split_words.push(words.slice(start, start + size).join(" "));
  }
}
console.log(split_words);
I would like to split natural text into word pairs, triplets, quadruplets and on!
I have figured out how to split into pairs so far. I assume I will need an additional loop to accommodate the word count.
Here is the code for pairs
// Sample sentence used to demonstrate word n-gram extraction.
const test = "I love you so much, but Joe said \"he doesn't\"!";

// Tokens are split on single spaces, so punctuation stays attached to its word.
const words = test.split(" ");

// Pairs (2-grams): every word joined with the word that follows it.
const two_words = [];
for (let i = 0; i < words.length - 1; i++) {
  two_words.push(`${words[i]} ${words[i + 1]}`);
}
console.log(two_words);

// General case: every run of 2..split_length consecutive words, shortest
// runs first. Each window is read from `words` (the source tokens) and
// rebuilt from scratch with slice/join, so nothing carries over from the
// previous window and the words stay separated by single spaces.
const split_length = 5;
const split_words = [];
for (let size = 2; size <= split_length; size++) {
  // `start + size <= words.length` keeps the window inside the array.
  for (let start = 0; start + size <= words.length; start++) {
    split_words.push(words.slice(start, start + size).join(" "));
  }
}
console.log(split_words);
Adding the expected output (an array of n-grams), something like this:
// 2grams
"I love"
"love you"
"you so"
"so much,"
"much, but"
"but Joe"
"Joe said"
"said "he"
""he doesn't"!"
//3grams
"I love you"
"love you so"
"you so much"
"so much, but"
//and on and on
Share
Improve this question
edited Feb 16, 2018 at 12:54
giorgio79
asked Feb 16, 2018 at 12:18
giorgio79giorgio79
4,23910 gold badges63 silver badges92 bronze badges
3
- Are you looking forward to solution which picks the pairs in sequence only? e.g. I and so is not the desired result? – Nikhil Aggarwal Commented Feb 16, 2018 at 12:25
- 1 Please share the expected output. – gurvinder372 Commented Feb 16, 2018 at 12:27
- Thx! Just added the expected result. – giorgio79 Commented Feb 16, 2018 at 13:06
4 Answers
Reset to default 8 This is called "n-grams" and can be done in modern JavaScript using generators like this:
/**
 * Lazily yields every run of `n` consecutive items from an iterable.
 *
 * @param {Iterable<*>} a - Source items, e.g. an array of words.
 * @param {number} n - Window size. Nothing is yielded when fewer than `n`
 *   items are available or when `n` is not a positive integer number.
 * @yields {Array<*>} A fresh array holding the current window in source order.
 */
function* ngrams(a, n) {
  const buf = [];
  for (const x of a) {
    buf.push(x);
    if (buf.length === n) {
      // Hand out a copy: the buffer is shifted right after the yield, so
      // yielding `buf` itself would make every collected window alias the
      // same (later mutated) array.
      yield buf.slice();
      buf.shift();
    }
  }
}
// Print every 3-word window of the sample sentence, one per line.
const test = "The quick brown fox jumps over the lazy dog";
const tokens = test.split(' ');
for (const gram of ngrams(tokens, 3)) {
  console.log(gram.join(' '));
}
Another, more concise and probably faster option:
/**
 * Returns every run of `n` consecutive elements of an array.
 *
 * @param {Array<*>} a - Source elements, e.g. an array of words.
 * @param {number} n - Window size; expected to be a positive integer.
 * @returns {Array<Array<*>>} The windows in source order. Empty when `n` is
 *   not a positive integer or when `a` has fewer than `n` elements.
 */
const ngrams = (a, n) => {
  // Number of window start positions. Computing it directly avoids the
  // `a.slice(0, 1 - n)` trick, which collapses to `a.slice(0, 0)` for n === 1
  // and therefore dropped every unigram.
  const count = a.length - n + 1;
  if (!Number.isInteger(n) || n < 1 || count < 1) {
    return [];
  }
  return Array.from({ length: count }, (_, i) => a.slice(i, i + n));
};
Assuming that your desired result does not include jumbled-order combinations, you can try the following:
// Code goes here
// Demo: collect every run of consecutive words, grouped by run length.
const test = "I love you so much, but Joe said \"he doesn't\"!";
const arr = test.split(" ");
const words = arr.length; // number of words in the sentence
const result = []; // one array of phrases per run length, filled by process()

/**
 * Appends to the shared `result` array one group of phrases per run length,
 * starting at `length` and growing by one for as long as the run still fits
 * in `arr` (so the last group is the whole sentence as a single phrase).
 *
 * @param {string[]} arr - Words to combine.
 * @param {number} length - Smallest run length to generate.
 */
function process(arr, length) {
  for (let size = length; size <= arr.length; size++) {
    const phrases = [];
    // Stop before a window would run past the end, so leftover words that
    // cannot fill a complete run are skipped.
    for (let start = 0; start + size <= arr.length; start++) {
      phrases.push(arr.slice(start, size + start).join(" "));
    }
    result.push(phrases);
  }
}

process(arr, 2);
console.log(result);
This should do what you're looking for:
/**
 * Splits a string on single spaces and returns every run of `chunk`
 * consecutive words, each run joined back together with a space.
 *
 * @param {string} str - Text to split.
 * @param {number} chunk - Number of words per run.
 * @returns {string[]} The runs in order of their starting word.
 */
function chunkIt(str, chunk) {
  const tokens = str.split(" ");
  const phrases = [];
  // A run starting at `first` ends at index first + (chunk - 1); keep going
  // while that last index is still inside the token list.
  for (let first = 0; first + (chunk - 1) < tokens.length; first++) {
    const run = tokens.slice(first, first + chunk);
    phrases.push(run.join(" "));
  }
  return phrases;
}
// Print the 2-, 3- and 4-word runs of the sample sentence.
const test = "I love you so much, but Joe said \"he doesn't\"!";
for (const size of [2, 3, 4]) {
  console.log(chunkIt(test, size));
}
You can dramatically shorten your code by using a library like lodash:
// Relies on lodash being available as the global `_`.
// NOTE(review): this produces consecutive, non-overlapping character groups
// (see the sample output below), not overlapping word n-grams.
const word = 'foobarbaz';
const groups = _.chunk(word, 2);
const chunks = groups.map((group) => group.join(''));
console.log(chunks); //[ 'fo', 'ob', 'ar', 'ba', 'z' ]
Then you can pass in values other than 2 to suit your needs
本文标签:
版权声明:本文标题:javascript - Split string into pairs, triplets, quadruplets and on (ngrams)? - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1743871105a2553487.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论