EPICS String hash functions and resourceLib.h

Experimental Physics and Industrial Control System

2002 2003 2004 2005 2006 2007 2008 <2009> 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024	Index	2002 2003 2004 2005 2006 2007 2008 <2009> 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
<== Date ==>		<== Thread ==>

Subject:	String hash functions and resourceLib.h
From:	Andrew Johnson <[email protected]>
To:	Jeff Hill <[email protected]>
Cc:	Core-Talk <[email protected]>
Date:	Mon, 23 Mar 2009 16:14:12 -0500

Hi Jeff,

I've been looking at string hash functions, and have recently replaced Marty's 
(as used in both gpHash and dbPvd) with one that is faster and performs as 
well on most IOCs and somewhat better on many.  The function is now available 
from epicsString.h, and I think we should use it in resourceLib.h as well.  
Unfortunately your hash function generates rather a lot of collisions with 
some loads, and as with Marty's function it does require a table look-up for 
every character, which slows it down compared to the pure CPU methodologies.

Here are the results of my tests, showing for each algorithm a histogram 
showing how many buckets contain a specific number of items.  Of the 
algorithms, "mrk" is Marty's original, and 'joh' is the stringId::hash 
algorithm from resourceLib.h.  The algorithm I picked for epicsString.c 
is "ap" by Arash Partow, which along with some of the others is described at 
http://www.partow.net/programming/hashfunctions/


tux% bin/linux-x86_64/hash /usr/local/iocapps/iocinfo/pvdata/iocpss05bm 6
Read 14896 items into 16384 buckets, averaging 0.909 items per bucket

Algorithm:    mrk   joh    ap  sdbm   djb    rs bkdr1 bkdr2 bkdr3 bkdr4 bkdr5
      0      6824  8871  6219  6705  7245  6339  6694  6358  7238  6380  6447
      1      5729  4486  6471  5858  5140  6316  5869  6292  5296  6242  6157
      2      2661  1501  2839  2727  2674  2796  2725  2813  2541  2835  2837
      3       905   542   699   846   979   768   850   737   877   756   745
      4       206   274   131   203   273   141   207   156   299   141   165
      5        51   231    24    37    61    24    35    25   107    28    28
      6         5   178     1     7    10     0     4     3    26     2     5
      7         3   133     0     1     2     0     0     0     0     0     0
      8         0    69     0     0     0     0     0     0     0     0     0
      9         0    52     0     0     0     0     0     0     0     0     0
     10         0    26     0     0     0     0     0     0     0     0     0
     11         0     8     0     0     0     0     0     0     0     0     0
     12         0     9     0     0     0     0     0     0     0     0     0
     13         0     2     0     0     0     0     0     0     0     0     0
     14         0     0     0     0     0     0     0     0     0     0     0
     15         0     2     0     0     0     0     0     0     0     0     0

Std.Dev.    0.987 1.594 0.896 0.968 1.040 0.911 0.964 0.916 1.064 0.917 0.928
FigOfMerit  0.984 1.547 0.901 0.967 1.032 0.915 0.963 0.920 1.054 0.920 0.931

FoM Rank        7    10     0     6     8     1     5     2     9     3     4


The input data for the above test is the record list from our IOC with the 
largest number of records.  I have run this against many other IOC record 
lists, adjusting the table size appropriately, and the "ap" algorithm usually 
appears at or near the top.  The source for my test program is attached; the 
second argument sets the table size and should be between 0 and 8.

I would be happy to commit the necessary changes to resourceLib.h if you wish 
me to.

- Andrew
-- 
The best FOSS code is written to be read by other humans -- Harold Welte

/* hash.cpp */

#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#define NELEMENTS(arr) (sizeof arr / sizeof arr[0])

#define MAX_PV_NAME_LEN 61

typedef unsigned char uint8_t;

int bitshift;

/*
 * Hash functions to test
 */
static const uint8_t T[256] = {
     39,159,180,252, 71,  6, 13,164,232, 35,226,155, 98,120,154, 69,
    157, 24,137, 29,147, 78,121, 85,112,  8,248,130, 55,117,190,160,
    176,131,228, 64,211,106, 38, 27,140, 30, 88,210,227,104, 84, 77,
     75,107,169,138,195,184, 70, 90, 61,166,  7,244,165,108,219, 51,
      9,139,209, 40, 31,202, 58,179,116, 33,207,146, 76, 60,242,124,
    254,197, 80,167,153,145,129,233,132, 48,246, 86,156,177, 36,187,
     45,  1, 96, 18, 19, 62,185,234, 99, 16,218, 95,128,224,123,253,
     42,109,  4,247, 72,  5,151,136,  0,152,148,127,204,133, 17, 14,
    182,217, 54,199,119,174, 82, 57,215, 41,114,208,206,110,239, 23,
    189, 15,  3, 22,188, 79,113,172, 28,  2,222, 21,251,225,237,105,
    102, 32, 56,181,126, 83,230, 53,158, 52, 59,213,118,100, 67,142,
    220,170,144,115,205, 26,125,168,249, 66,175, 97,255, 92,229, 91,
    214,236,178,243, 46, 44,201,250,135,186,150,221,163,216,162, 43,
     11,101, 34, 37,194, 25, 50, 12, 87,198,173,240,193,171,143,231,
    111,141,191,103, 74,245,223, 20,161,235,122, 63, 89,149, 73,238,
    134, 68, 93,183,241, 81,196, 49,192, 65,212, 94,203, 10,200, 47
};

int mrk(const char *str, int length)
{
    const int n = length & ~1;
    uint8_t h0 = 0;
    uint8_t h1 = 0;
    int i;

    if (length & 1)
        h0 = T[h0 ^ str[n]];
    for (i = 0; i < n; i += 2) {
        h0 = T[h0 ^ *str++];
        h1 = T[h1 ^ *str++];
    }
    return ((int)h1 << bitshift) ^ h0;
}

int joh(const char *str, int length)
{
    const unsigned char *pUStr = reinterpret_cast<const unsigned char *>(str);
    unsigned h0 = 0;
    unsigned h1 = 0;
    unsigned h2 = 0;
    unsigned h3 = 0;
    unsigned c;

    while (1) {
        if ((c = *pUStr++) == 0) break;
        h0 = T[h0 ^ c];
        if ((c = *pUStr++) == 0) break;
        h1 = T[h1 ^ c];
        if ((c = *pUStr++) == 0) break;
        h2 = T[h2 ^ c];
        if ((c = *pUStr++) == 0) break;
        h3 = T[h3 ^ c];
    }

    size_t hash = (h3 << 24) | (h2 << 16) | (h1 << 8) | h0;
    unsigned width = 32;
    do {
        width >>= 1u;
        hash ^= hash >> width;
    } while (width > 8);

    return hash;
}

int ap(const char *str, int length)
{
    unsigned int hash = 0;

    while (length-- > 0) {
        hash ^= ~((hash << 11) ^ *str++ ^ (hash >> 5));
        if (!(length-- > 0)) break;
        hash ^= (hash << 7) ^ *str++ ^ (hash >> 3);
    }
    return hash;
}


int rs(const char *str, int length)
{
    int a = 63689, b = 378551;
    int hash = 0;
    while (length-- > 0) {
        hash = hash * a + *str++;
        a *= b;
    }
    return hash;
}

int js(const char *str, int length)
{
    int hash = 1315423911;

    while (length-- > 0) {
        hash ^= ((hash << 5) + (hash >> 2) + *str++);
    }
    return hash;
}

int muladd(const char *str, int length, int hash, int mul)
{
    while (length-- > 0) {
        hash = hash*mul + *str++;
    }
    return hash;
}

int bkdr1(const char *str, int length)
{
    return muladd(str, length, 0, 31);
}

int bkdr2(const char *str, int length)
{
    return muladd(str, length, 0, 131);
}

int bkdr3(const char *str, int length)
{
    return muladd(str, length, 0, 1313);
}

int bkdr4(const char *str, int length)
{
    return muladd(str, length, 0, 13131);
}

int bkdr5(const char *str, int length)
{
    return muladd(str, length, 0, 131313);
}

int sdbm(const char *str, int length)
{
    return muladd(str, length, 0, 65599);
}

int djb(const char *str, int length)
{
    return muladd(str, length, 5381, 33);
}

/* Test structure */

typedef int (*HASHFUNC)(const char *, int);

#define h(f) {#f, f}
static struct {
    char *name;
    HASHFUNC func;
} hashes[] = {
    h(mrk),
    h(joh),
    h(ap),
    h(sdbm),
    h(djb),
    h(rs),
    h(bkdr1),
    h(bkdr2),
    h(bkdr3),
    h(bkdr4),
    h(bkdr5),
};
#define NALGORITHMS ((int)NELEMENTS(hashes))

int main(int argc,char **argv)
{

    if (argc != 3) {
        fprintf(stderr, "usage: hash filename shift\n");
        exit(1);
    }

    bitshift = atoi(argv[2]);
    if (bitshift < 0 || bitshift > 8) {
        printf("hash: shift parameter must be between 0 and 8\n");
        exit(1);
    }

    int nBuckets = (256 << bitshift);
    unsigned mask = (nBuckets - 1);
    unsigned bucket[nBuckets][NALGORITHMS];
    for (int h = 0; h < nBuckets; h++)
        for (int alg = 0; alg < NALGORITHMS; alg++)
            bucket[h][alg] = 0;

    FILE *fp = fopen(argv[1], "r");
    if (!fp) {
        perror("hash");
        exit(1);
    }

    unsigned items = 0;
    unsigned largest = 0;
    while(!feof(fp)) {
        char tempStr[MAX_PV_NAME_LEN];

        char *pstr = fgets(tempStr, MAX_PV_NAME_LEN, fp);
        if (!pstr) break;
        int len = strlen(pstr);
        if (len <= 1) continue;
        pstr[--len] = 0; /* Ignore trailing newline */

        items++;
        for (int alg = 0; alg < NALGORITHMS; alg++) {
            int hash = hashes[alg].func(pstr, len) & mask;
            unsigned b = ++bucket[hash][alg];
            if (b > largest) largest = b;
        }
    }
    fclose(fp);

    if (items == 0) {
        printf("File empty\n");
        exit(0);
    }
    printf("Read %u items into %d buckets, averaging %.3f items per bucket\n\n",
        items, nBuckets, (double)items / nBuckets);


    const unsigned nfreqs = largest + 1;
    unsigned freq[NALGORITHMS][nfreqs];
    double X[NALGORITHMS] = {0.0};
    double XX[NALGORITHMS] = {0.0};
    for (int alg = 0; alg < NALGORITHMS; alg++)
        for (unsigned k = 0; k < nfreqs; k++)
            freq[alg][k] = 0;

    for (int h = 0; h < nBuckets; h++) {
        for (int alg = 0; alg < NALGORITHMS; alg++) {
            unsigned count = bucket[h][alg];
            ++freq[alg][count];
            if (count) {
                X[alg] += count;
                XX[alg] += count * count;
            }
        }
    }

    double stdDev[NALGORITHMS];
    for (int alg = 0; alg < NALGORITHMS; alg++) {
        double mean = X[alg] / items;
        stdDev[alg] = sqrt(XX[alg] / items - mean * mean);
    }

    double F[NALGORITHMS] = {0.0};
    double FF[NALGORITHMS] = {0.0};
    for (int alg = 0; alg < NALGORITHMS; alg++) {
        for (unsigned k = 0; k < nfreqs; k++) {
            double f = k*freq[alg][k];
            F[alg] += f;
            FF[alg] += f * k;
        }
    }

    double fom[NALGORITHMS];
    unsigned rank[NALGORITHMS] = {0};
    for (int alg = 0; alg < NALGORITHMS; alg++) {
        double mean = F[alg] / nBuckets;
        fom[alg] = sqrt(FF[alg] / nBuckets - mean * mean);
        for (int j = 0; j < alg; j++)
            if (fom[j] != fom[alg])
                (fom[j] > fom[alg]) ? ++rank[j] : ++rank[alg];
    }

    printf("Algorithm:");
    for (int alg = 0; alg < NALGORITHMS; alg++)
        printf(" %5s", hashes[alg].name);
    printf("\n");

    for (unsigned u = 0; u < nfreqs; u++) {
        printf(" %6u   ", u);
        for (int alg = 0; alg < NALGORITHMS; alg++)
            printf(" %5u", freq[alg][u]);
        printf("\n");
    }
    printf("\n");

    printf("Std.Dev.  ");
    for (int alg = 0; alg < NALGORITHMS; alg++) {
        printf(" %5.3f", stdDev[alg]);
    }
    printf("\n");

    printf("FigOfMerit");
    for (int alg = 0; alg < NALGORITHMS; alg++) {
        printf(" %5.3f", fom[alg]);
    }
    printf("\n\n");

    printf("FoM Rank  ");
    for (int alg = 0; alg < NALGORITHMS; alg++) {
        printf(" %5u", rank[alg]);
    }
    printf("\n");

    return 0;
}

Replies:: RE: String hash functions and resourceLib.h Jeff Hill

Navigate by Date:: Prev: RE: SIGALRM => SIGUSR1? Jeff Hill; Next: RE: String hash functions and resourceLib.h Jeff Hill; Index: 2002 2003 2004 2005 2006 2007 2008 <2009> 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024
Navigate by Thread:: Prev: RE: SIGALRM => SIGUSR1? Jeff Hill; Next: RE: String hash functions and resourceLib.h Jeff Hill; Index: 2002 2003 2004 2005 2006 2007 2008 <2009> 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024