Experimental Physics and Industrial Control System
Hi Jeff,
I've been looking at string hash functions, and have recently replaced Marty's
(as used in both gpHash and dbPvd) with one that is faster and performs as
well on most IOCs and somewhat better on many. The function is now available
from epicsString.h, and I think we should use it in resourceLib.h as well.
Unfortunately your hash function generates rather a lot of collisions with
some loads, and as with Marty's function it does require a table look-up for
every character, which slows it down compared to the pure CPU methodologies.
Here are the results of my tests, showing for each algorithm a histogram
showing how many buckets contain a specific number of items. Of the
algorithms, "mrk" is Marty's original, and 'joh' is the stringId::hash
algorithm from resourceLib.h. The algorithm I picked for epicsString.c
is "ap" by Arash Partow, which along with some of the others is described at
http://www.partow.net/programming/hashfunctions/
tux% bin/linux-x86_64/hash /usr/local/iocapps/iocinfo/pvdata/iocpss05bm 6
Read 14896 items into 16384 buckets, averaging 0.909 items per bucket
Algorithm: mrk joh ap sdbm djb rs bkdr1 bkdr2 bkdr3 bkdr4 bkdr5
0 6824 8871 6219 6705 7245 6339 6694 6358 7238 6380 6447
1 5729 4486 6471 5858 5140 6316 5869 6292 5296 6242 6157
2 2661 1501 2839 2727 2674 2796 2725 2813 2541 2835 2837
3 905 542 699 846 979 768 850 737 877 756 745
4 206 274 131 203 273 141 207 156 299 141 165
5 51 231 24 37 61 24 35 25 107 28 28
6 5 178 1 7 10 0 4 3 26 2 5
7 3 133 0 1 2 0 0 0 0 0 0
8 0 69 0 0 0 0 0 0 0 0 0
9 0 52 0 0 0 0 0 0 0 0 0
10 0 26 0 0 0 0 0 0 0 0 0
11 0 8 0 0 0 0 0 0 0 0 0
12 0 9 0 0 0 0 0 0 0 0 0
13 0 2 0 0 0 0 0 0 0 0 0
14 0 0 0 0 0 0 0 0 0 0 0
15 0 2 0 0 0 0 0 0 0 0 0
Std.Dev. 0.987 1.594 0.896 0.968 1.040 0.911 0.964 0.916 1.064 0.917 0.928
FigOfMerit 0.984 1.547 0.901 0.967 1.032 0.915 0.963 0.920 1.054 0.920 0.931
FoM Rank 7 10 0 6 8 1 5 2 9 3 4
The input data for the above test is the record list from our IOC with the
largest number of records. I have run this against many other IOC record
lists, adjusting the table size appropriately, and the "ap" algorithm usually
appears at or near the top. The source for my test program is attached; the
second argument sets the table size and should be between 0 and 8.
I would be happy to commit the necessary changes to resourceLib.h if you wish
me to.
- Andrew
--
The best FOSS code is written to be read by other humans -- Harold Welte
/* hash.cpp */
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#define NELEMENTS(arr) (sizeof arr / sizeof arr[0])
#define MAX_PV_NAME_LEN 61
typedef unsigned char uint8_t;
int bitshift;
/*
* Hash functions to test
*/
static const uint8_t T[256] = {
39,159,180,252, 71, 6, 13,164,232, 35,226,155, 98,120,154, 69,
157, 24,137, 29,147, 78,121, 85,112, 8,248,130, 55,117,190,160,
176,131,228, 64,211,106, 38, 27,140, 30, 88,210,227,104, 84, 77,
75,107,169,138,195,184, 70, 90, 61,166, 7,244,165,108,219, 51,
9,139,209, 40, 31,202, 58,179,116, 33,207,146, 76, 60,242,124,
254,197, 80,167,153,145,129,233,132, 48,246, 86,156,177, 36,187,
45, 1, 96, 18, 19, 62,185,234, 99, 16,218, 95,128,224,123,253,
42,109, 4,247, 72, 5,151,136, 0,152,148,127,204,133, 17, 14,
182,217, 54,199,119,174, 82, 57,215, 41,114,208,206,110,239, 23,
189, 15, 3, 22,188, 79,113,172, 28, 2,222, 21,251,225,237,105,
102, 32, 56,181,126, 83,230, 53,158, 52, 59,213,118,100, 67,142,
220,170,144,115,205, 26,125,168,249, 66,175, 97,255, 92,229, 91,
214,236,178,243, 46, 44,201,250,135,186,150,221,163,216,162, 43,
11,101, 34, 37,194, 25, 50, 12, 87,198,173,240,193,171,143,231,
111,141,191,103, 74,245,223, 20,161,235,122, 63, 89,149, 73,238,
134, 68, 93,183,241, 81,196, 49,192, 65,212, 94,203, 10,200, 47
};
int mrk(const char *str, int length)
{
const int n = length & ~1;
uint8_t h0 = 0;
uint8_t h1 = 0;
int i;
if (length & 1)
h0 = T[h0 ^ str[n]];
for (i = 0; i < n; i += 2) {
h0 = T[h0 ^ *str++];
h1 = T[h1 ^ *str++];
}
return ((int)h1 << bitshift) ^ h0;
}
int joh(const char *str, int length)
{
const unsigned char *pUStr = reinterpret_cast<const unsigned char *>(str);
unsigned h0 = 0;
unsigned h1 = 0;
unsigned h2 = 0;
unsigned h3 = 0;
unsigned c;
while (1) {
if ((c = *pUStr++) == 0) break;
h0 = T[h0 ^ c];
if ((c = *pUStr++) == 0) break;
h1 = T[h1 ^ c];
if ((c = *pUStr++) == 0) break;
h2 = T[h2 ^ c];
if ((c = *pUStr++) == 0) break;
h3 = T[h3 ^ c];
}
size_t hash = (h3 << 24) | (h2 << 16) | (h1 << 8) | h0;
unsigned width = 32;
do {
width >>= 1u;
hash ^= hash >> width;
} while (width > 8);
return hash;
}
int ap(const char *str, int length)
{
unsigned int hash = 0;
while (length-- > 0) {
hash ^= ~((hash << 11) ^ *str++ ^ (hash >> 5));
if (!(length-- > 0)) break;
hash ^= (hash << 7) ^ *str++ ^ (hash >> 3);
}
return hash;
}
int rs(const char *str, int length)
{
int a = 63689, b = 378551;
int hash = 0;
while (length-- > 0) {
hash = hash * a + *str++;
a *= b;
}
return hash;
}
int js(const char *str, int length)
{
int hash = 1315423911;
while (length-- > 0) {
hash ^= ((hash << 5) + (hash >> 2) + *str++);
}
return hash;
}
int muladd(const char *str, int length, int hash, int mul)
{
while (length-- > 0) {
hash = hash*mul + *str++;
}
return hash;
}
int bkdr1(const char *str, int length)
{
return muladd(str, length, 0, 31);
}
int bkdr2(const char *str, int length)
{
return muladd(str, length, 0, 131);
}
int bkdr3(const char *str, int length)
{
return muladd(str, length, 0, 1313);
}
int bkdr4(const char *str, int length)
{
return muladd(str, length, 0, 13131);
}
int bkdr5(const char *str, int length)
{
return muladd(str, length, 0, 131313);
}
int sdbm(const char *str, int length)
{
return muladd(str, length, 0, 65599);
}
int djb(const char *str, int length)
{
return muladd(str, length, 5381, 33);
}
/* Test structure */
typedef int (*HASHFUNC)(const char *, int);
#define h(f) {#f, f}
static struct {
char *name;
HASHFUNC func;
} hashes[] = {
h(mrk),
h(joh),
h(ap),
h(sdbm),
h(djb),
h(rs),
h(bkdr1),
h(bkdr2),
h(bkdr3),
h(bkdr4),
h(bkdr5),
};
#define NALGORITHMS ((int)NELEMENTS(hashes))
int main(int argc,char **argv)
{
if (argc != 3) {
fprintf(stderr, "usage: hash filename shift\n");
exit(1);
}
bitshift = atoi(argv[2]);
if (bitshift < 0 || bitshift > 8) {
printf("hash: shift parameter must be between 0 and 8\n");
exit(1);
}
int nBuckets = (256 << bitshift);
unsigned mask = (nBuckets - 1);
unsigned bucket[nBuckets][NALGORITHMS];
for (int h = 0; h < nBuckets; h++)
for (int alg = 0; alg < NALGORITHMS; alg++)
bucket[h][alg] = 0;
FILE *fp = fopen(argv[1], "r");
if (!fp) {
perror("hash");
exit(1);
}
unsigned items = 0;
unsigned largest = 0;
while(!feof(fp)) {
char tempStr[MAX_PV_NAME_LEN];
char *pstr = fgets(tempStr, MAX_PV_NAME_LEN, fp);
if (!pstr) break;
int len = strlen(pstr);
if (len <= 1) continue;
pstr[--len] = 0; /* Ignore trailing newline */
items++;
for (int alg = 0; alg < NALGORITHMS; alg++) {
int hash = hashes[alg].func(pstr, len) & mask;
unsigned b = ++bucket[hash][alg];
if (b > largest) largest = b;
}
}
fclose(fp);
if (items == 0) {
printf("File empty\n");
exit(0);
}
printf("Read %u items into %d buckets, averaging %.3f items per bucket\n\n",
items, nBuckets, (double)items / nBuckets);
const unsigned nfreqs = largest + 1;
unsigned freq[NALGORITHMS][nfreqs];
double X[NALGORITHMS] = {0.0};
double XX[NALGORITHMS] = {0.0};
for (int alg = 0; alg < NALGORITHMS; alg++)
for (unsigned k = 0; k < nfreqs; k++)
freq[alg][k] = 0;
for (int h = 0; h < nBuckets; h++) {
for (int alg = 0; alg < NALGORITHMS; alg++) {
unsigned count = bucket[h][alg];
++freq[alg][count];
if (count) {
X[alg] += count;
XX[alg] += count * count;
}
}
}
double stdDev[NALGORITHMS];
for (int alg = 0; alg < NALGORITHMS; alg++) {
double mean = X[alg] / items;
stdDev[alg] = sqrt(XX[alg] / items - mean * mean);
}
double F[NALGORITHMS] = {0.0};
double FF[NALGORITHMS] = {0.0};
for (int alg = 0; alg < NALGORITHMS; alg++) {
for (unsigned k = 0; k < nfreqs; k++) {
double f = k*freq[alg][k];
F[alg] += f;
FF[alg] += f * k;
}
}
double fom[NALGORITHMS];
unsigned rank[NALGORITHMS] = {0};
for (int alg = 0; alg < NALGORITHMS; alg++) {
double mean = F[alg] / nBuckets;
fom[alg] = sqrt(FF[alg] / nBuckets - mean * mean);
for (int j = 0; j < alg; j++)
if (fom[j] != fom[alg])
(fom[j] > fom[alg]) ? ++rank[j] : ++rank[alg];
}
printf("Algorithm:");
for (int alg = 0; alg < NALGORITHMS; alg++)
printf(" %5s", hashes[alg].name);
printf("\n");
for (unsigned u = 0; u < nfreqs; u++) {
printf(" %6u ", u);
for (int alg = 0; alg < NALGORITHMS; alg++)
printf(" %5u", freq[alg][u]);
printf("\n");
}
printf("\n");
printf("Std.Dev. ");
for (int alg = 0; alg < NALGORITHMS; alg++) {
printf(" %5.3f", stdDev[alg]);
}
printf("\n");
printf("FigOfMerit");
for (int alg = 0; alg < NALGORITHMS; alg++) {
printf(" %5.3f", fom[alg]);
}
printf("\n\n");
printf("FoM Rank ");
for (int alg = 0; alg < NALGORITHMS; alg++) {
printf(" %5u", rank[alg]);
}
printf("\n");
return 0;
}
- Replies:
- RE: String hash functions and resourceLib.h Jeff Hill
- Navigate by Date:
- Prev:
RE: SIGALRM => SIGUSR1? Jeff Hill
- Next:
RE: String hash functions and resourceLib.h Jeff Hill
- Index:
2002
2003
2004
2005
2006
2007
2008
<2009>
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
- Navigate by Thread:
- Prev:
RE: SIGALRM => SIGUSR1? Jeff Hill
- Next:
RE: String hash functions and resourceLib.h Jeff Hill
- Index:
2002
2003
2004
2005
2006
2007
2008
<2009>
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024