admin管理员组文章数量:1317123
Just wondering if there is some other way than this.
var hashStringArray = function(array) {
array.sort();
return array.join('|');
};
I don't like sorting much and using that delimiter is not safe either if it's contained in one of the strings. In overall I need to produce same hash no matter the order of strings. It will be rather short arrays (up to 10 items), but it will be required very often so it shouldn't be too slow.
I intend to use it with ES6 Map object and I need to easily find same array collection.
Updated example of use
var theMap = new Map();
var lookup = function(arr) {
var item = null;
var hashed = hashStringArray(arr);
if (item = theMap.get( hashed )) {
return item;
}
theMap.set( hashed, itemBasedOnInput );
return itemBasedOnInput;
}
var arr1 = ['alpha','beta','gama'];
var arr2 = ['beta','alpha','gama'];
lookup(arr1) === lookup(arr2)
Performance tests
Just wondering if there is some other way than this.
var hashStringArray = function(array) {
array.sort();
return array.join('|');
};
I don't like sorting much and using that delimiter is not safe either if it's contained in one of the strings. In overall I need to produce same hash no matter the order of strings. It will be rather short arrays (up to 10 items), but it will be required very often so it shouldn't be too slow.
I intend to use it with ES6 Map object and I need to easily find same array collection.
Updated example of use
var theMap = new Map();
var lookup = function(arr) {
var item = null;
var hashed = hashStringArray(arr);
if (item = theMap.get( hashed )) {
return item;
}
theMap.set( hashed, itemBasedOnInput );
return itemBasedOnInput;
}
var arr1 = ['alpha','beta','gama'];
var arr2 = ['beta','alpha','gama'];
lookup(arr1) === lookup(arr2)
Performance tests
http://jsperf./hashing-array-of-strings/5
Share Improve this question edited Aug 3, 2014 at 16:39 FredyC asked Aug 3, 2014 at 12:02 FredyCFredyC 4,0674 gold badges31 silver badges40 bronze badges 19-
I could be wrong but doesn't ES6 Map support non-string keys? i.e.
theMap.set(arr1, "test")
. Still, you could use JSON.stringify(arr1) or create a "real" hash function that converts a string into a number... – Bart Commented Aug 3, 2014 at 12:11 - 2 @FredyC Can you please give an example where the delimiter would be a problem? – thefourtheye Commented Aug 3, 2014 at 12:17
-
2
On the other hand, you could just use
arr.join("\0")
, it seems unlikely anyone will put a null character in a string... – Bart Commented Aug 3, 2014 at 12:19 -
3
@thefourtheye If you use
|
as the delimiter, these two arrays would produce the same hash:["abc", "def|ghi"]
and["abc","def","ghi"]
. – Barmar Commented Aug 3, 2014 at 12:22 - 1 Also, there's no guarantee that MD5 will be unique, although collisions are unlikely. – Barmar Commented Aug 3, 2014 at 12:23
4 Answers
Reset to default 6Two things occurred to me as the basis of a solution:
summing doesn't depend on order, which is actually a flaw in simple checksums (they don't catch changes in block order within a word), and
we can convert strings to summable numbers using their charcodes
Here's a function to do (2) :
charsum = function(s) {
var i, sum = 0;
for (i = 0; i < s.length; i++) {
sum += (s.charCodeAt(i) * (i+1));
}
return sum
}
Here's a version of (1) that putes an array hash by summing the charsum values:
array_hash = function(a) {
var i, sum = 0
for (i = 0; i < a.length; i++) {
var cs = charsum(a[i])
sum = sum + (65027 / cs)
}
return ("" + sum).slice(0,16)
}
Fiddle here: http://jsfiddle/WS9dC/11/
If we did a straight sum of the charsum values, then the array ["a", "d"] would have the same hash as the array ["b", "c"] - leading to undesired collisions. So based on using non-UTF strings, where charcodes go up to 255, and allowing for 255 characters in each string, then the max return value of charsum is 255 * 255 = 65025. So I picked the next prime number up, 65027, and used (65027 / cs) to pute the hash. I am not 100% convinced this removes collisions... perhaps more thought needed... but it certainly fixes the [a, d] versus [b, c] case. Testing:
var arr1 = ['alpha','beta','gama'];
var arr2 = ['beta','alpha','gama'];
console.log(array_hash(arr1))
console.log(array_hash(arr2))
console.log(array_hash(arr1) == array_hash(arr2))
Outputs:
443.5322979371356
443.5322979371356
true
And testing a case that shows different hashes:
var arr3 = ['a', 'd'];
var arr4 = ['b', 'c'];
console.log(array_hash(arr3))
console.log(array_hash(arr4))
console.log(array_hash(arr3) == array_hash(arr4))
outputs:
1320.651443298969
1320.3792001649144
false
Edit:
Here's a revised version, which ignore duplicates from the arrays as it goes, and return the hash based on unique items only:
http://jsfiddle/WS9dC/7/
array_hash = function(a) {
var i, sum = 0, product = 1
for (i = 0; i < a.length; i++) {
var cs = charsum(a[i])
if (product % cs > 0) {
product = product * cs
sum = sum + (65027 / cs)
}
}
return ("" + sum).slice(0, 16)
}
testing:
var arr1 = ['alpha', 'beta', 'gama', 'delta', 'theta', 'alpha', 'gama'];
var arr2 = ["beta", "gama", "alpha", "theta", "delta", "beta"];
console.log(array_hash(arr1))
console.log(array_hash(arr2))
console.log(array_hash(arr1) === array_hash(arr2))
returns:
689.878503111701
689.878503111701
true
Edit
I've revised the answer above to account for arrays of words that have the same letters. We need these to return different hashes, which they now do:
var arr1 = ['alpha', 'beta']
var arr2 = ['alhpa', 'ateb']
The fix was to add a multiplier to the charsum func based on the char index:
sum += (s.charCodeAt(i) * (i+1));
If you calculate a numeric hash code for each string, then you can bine them with an operator where the order doesn't matter, like the ^
XOR operator, then you don't need to sort the array:
function hashStringArray(array) {
var code = 0;
for (var i = 0; i < array.length; i++) {
var n = 0;
for (var j = 0; j < array[i].length; j++) {
n = n * 251 ^ array[i].charCodeAt(j);
}
code ^= n;
}
return code
};
You can do this:
var hashStringArray = function(array) {
return array.sort().join('\u200b');
};
The \u200b
character is an unicode character that also means null
, but is not the same as the \0
character, which is most widely used.
'\u200b' == '\0'
> false
An idea to have very fast hash if your set of possible string is less than 32 items long : hash the string with a built-in hash function that will return power-of two as hash :
function getStringHash(aString) {
var currentPO2 = 0;
var hashSet = [];
getStringHash = function ( aString) {
var aHash = hashSet[aString];
if (aHash) return aHash;
aHash = 1 << currentPO2++;
hashSet[aString] = aHash;
return aHash;
}
return getStringHash(aString);
}
Then use this hash on your string array, ORing the hashes ( | ) :
function getStringArrayHash( aStringArray) {
var aHash = 0;
for (var i=0; i<aStringArray.length; i++) {
aHash |= getStringHash(aStringArray[i]);
}
return aHash;
}
So to test a bit :
console.log(getStringHash('alpha')); // 1
console.log(getStringHash('beta')); // 2
console.log(getStringHash('gamma')); // 4
console.log(getStringHash('alpha')); // 1 again
var arr1 = ['alpha','beta','gama'];
var arr2 = ['beta','alpha','gama'];
var arr3 = ['alpha', 'teta'];
console.log(getStringArrayHash(arr1)); // 11
console.log(getStringArrayHash(arr2)); // 11 also, like for arr1
var arr3 = ['alpha', 'teta'];
console.log(getStringArrayHash(arr3)); // 17 : a different array has != hashset
jsbin is here : http://jsbin./rozanufa/1/edit?js,console
RQ !!! with this method, arrays are considered as set, meaning that a repeated item won't change the hash of an array !!!
This HAS to be faster since it uses only 1) function call 2) lookup 3) integer arithmetic. So no sort, no (long) string, no concat.
jsperf confirms that : http://jsperf./hashing-array-of-strings/4
EDIT :
version with prime numbers, here : http://jsbin./rozanufa/3/edit?js,console
// return the unique prime associated with the string.
function getPrimeStringHash(aString) {
var hashSet = [];
var currentPrimeIndex = 0;
var primes = [ 2, 3, 5, 7, 11, 13, 17 ];
getPrimeStringHash = function ( aString) {
var aPrime = hashSet[aString];
if (aPrime) return aPrime;
if (currentPrimeIndex == primes.length) aPrime = getNextPrime();
else aPrime = primes[currentPrimeIndex];
currentPrimeIndex++
hashSet[aString] = aPrime;
return aPrime;
};
return getPrimeStringHash(aString);
// pute next prime number, store it and returns it.
function getNextPrime() {
var pr = primes[primes.length-1];
do {
pr+=2;
var divides = false;
// discard the number if it divides by one earlier prime.
for (var i=0; i<primes.length; i++) {
if ( ( pr % primes[i] ) == 0 ) {
divides = true;
break;
}
}
} while (divides == true)
primes.push(pr);
return pr;
}
}
function getStringPrimeArrayHash( aStringArray) {
var primeMul = 1;
for (var i=0; i<aStringArray.length; i++) {
primeMul *= getPrimeStringHash(aStringArray[i]);
}
return primeMul;
}
function pareByPrimeHash( aStringArray, anotherStringArray) {
var mul1 = getStringPrimeArrayHash ( aStringArray ) ;
var mul2 = getStringPrimeArrayHash ( anotherStringArray ) ;
return ( mul1 > mul2 ) ?
! ( mul1 % mul2 )
: ! ( mul2 % mul1 );
// Rq : just test for mul1 == mul2 if you are sure there's no duplicates
}
Tests :
console.log(getPrimeStringHash('alpha')); // 2
console.log(getPrimeStringHash('beta')); // 3
console.log(getPrimeStringHash('gamma')); // 5
console.log(getPrimeStringHash('alpha')); // 2 again
console.log(getPrimeStringHash('a1')); // 7
console.log(getPrimeStringHash('a2')); // 11
var arr1 = ['alpha','beta','gamma'];
var arr2 = ['beta','alpha','gamma'];
var arr3 = ['alpha', 'teta'];
var arr4 = ['alpha','beta','gamma', 'alpha']; // == arr1 + duplicate 'alpha'
console.log(getStringPrimeArrayHash(arr1)); // 30
console.log(getStringPrimeArrayHash(arr2)); // 30 also, like for arr1
var arr3 = ['alpha', 'teta'];
console.log(getStringPrimeArrayHash(arr3)); // 26 : a different array has != hashset
console.log(pareByPrimeHash(arr1, arr2) ); // true
console.log(pareByPrimeHash(arr1, arr3) ); // false
console.log(pareByPrimeHash(arr1, arr4) ); // true despite duplicate
本文标签: Hashing array of strings in javascriptStack Overflow
版权声明:本文标题:Hashing array of strings in javascript - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1742004407a2411666.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论