I don't think the accepted answer is really representative of cPython's internal hash implementations, which can be found in pyhash.c
:
Description of hashing algorithm for numeric types:
/* For numeric types, the hash of a number x is based on the reduction
of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
hash(x) == hash(y) whenever x and y are numerically equal, even if
x and y have different types.
A quick summary of the hashing strategy:
(1) First define the 'reduction of x modulo P' for any rational
number x; this is a standard extension of the usual notion of
reduction modulo P for integers. If x == p/q (written in lowest
terms), the reduction is interpreted as the reduction of p times
the inverse of the reduction of q, all modulo P; if q is exactly
divisible by P then define the reduction to be infinity. So we've
got a well-defined map
reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
(2) Now for a rational number x, define hash(x) by:
reduce(x) if x >= 0
-reduce(-x) if x < 0
If the result of the reduction is infinity (this is impossible for
integers, floats and Decimals) then use the predefined hash value
_PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
_PyHASH_INF and -_PyHASH_INF are also used for the
hashes of float and Decimal infinities.
NaNs hash with a pointer hash. Having distinct hash values prevents
catastrophic pileups from distinct NaN instances which used to always
have the same hash value but would compare unequal.
A selling point for the above strategy is that it makes it possible
to compute hashes of decimal and binary floating-point numbers
efficiently, even if the exponent of the binary or decimal number
is large. The key point is that
reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
binary or decimal float is never infinity, since the denominator is a power
of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
for nonnegative x,
reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
and reduce(10**e) can be computed efficiently by the usual modular
exponentiation algorithm. For reduce(2**e) it's even better: since
P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
*/
Hashing of doubles:
Py_hash_t
_Py_HashDouble(double v)
{
int e, sign;
double m;
Py_uhash_t x, y;
if (!Py_IS_FINITE(v)) {
if (Py_IS_INFINITY(v))
return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
else
return _PyHASH_NAN;
}
m = frexp(v, &e);
sign = 1;
if (m < 0) {
sign = -1;
m = -m;
}
/* process 28 bits at a time; this should work well both for binary
and hexadecimal floating point. */
x = 0;
while (m) {
x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
m *= 268435456.0; /* 2**28 */
e -= 28;
y = (Py_uhash_t)m; /* pull out integer part */
m -= y;
x += y;
if (x >= _PyHASH_MODULUS)
x -= _PyHASH_MODULUS;
}
/* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
x = x * sign;
if (x == (Py_uhash_t)-1)
x = (Py_uhash_t)-2;
return (Py_hash_t)x;
}
Hashing of pointers:
Py_hash_t
_Py_HashPointerRaw(const void *p)
{
size_t y = (size_t)p;
/* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
excessive hash collisions for dicts and sets */
y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
return (Py_hash_t)y;
}
Py_hash_t
_Py_HashPointer(const void *p)
{
Py_hash_t x = _Py_HashPointerRaw(p);
if (x == -1) {
x = -2;
}
return x;
}
Hashing of bytes for very short strings uses DJBX33A, otherwise uses the default hash:
Py_hash_t
_Py_HashBytes(const void *src, Py_ssize_t len)
{
Py_hash_t x;
/*
We make the hash of the empty string be 0, rather than using
(prefix ^ suffix), since this slightly obfuscates the hash secret
*/
if (len == 0) {
return 0;
}
#ifdef Py_HASH_STATS
hashstats[(len <= Py_HASH_STATS_MAX) ? len : 0]++;
#endif
#if Py_HASH_CUTOFF > 0
if (len < Py_HASH_CUTOFF) {
/* Optimize hashing of very small strings with inline DJBX33A. */
Py_uhash_t hash;
const unsigned char *p = src;
hash = 5381; /* DJBX33A starts with 5381 */
switch(len) {
/* ((hash << 5) + hash) + *p == hash * 33 + *p */
case 7: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
case 6: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
case 5: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
case 4: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
case 3: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
case 2: hash = ((hash << 5) + hash) + *p++; /* fallthrough */
case 1: hash = ((hash << 5) + hash) + *p++; break;
default:
Py_UNREACHABLE();
}
hash ^= len;
hash ^= (Py_uhash_t) _Py_HashSecret.djbx33a.suffix;
x = (Py_hash_t)hash;
}
else
#endif /* Py_HASH_CUTOFF */
x = PyHash_Func.hash(src, len);
if (x == -1)
return -2;
return x;
}
The file also implements modified FNV hashing:
#if Py_HASH_ALGORITHM == Py_HASH_FNV
/* **************************************************************************
* Modified Fowler-Noll-Vo (FNV) hash function
*/
static Py_hash_t
fnv(const void *src, Py_ssize_t len)
{
const unsigned char *p = src;
Py_uhash_t x;
Py_ssize_t remainder, blocks;
union {
Py_uhash_t value;
unsigned char bytes[SIZEOF_PY_UHASH_T];
} block;
#ifdef Py_DEBUG
assert(_Py_HashSecret_Initialized);
#endif
remainder = len % SIZEOF_PY_UHASH_T;
if (remainder == 0) {
/* Process at least one block byte by byte to reduce hash collisions
* for strings with common prefixes. */
remainder = SIZEOF_PY_UHASH_T;
}
blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
x ^= (Py_uhash_t) *p << 7;
while (blocks--) {
PY_UHASH_CPY(block.bytes, p);
x = (_PyHASH_MULTIPLIER * x) ^ block.value;
p += SIZEOF_PY_UHASH_T;
}
/* add remainder */
for (; remainder > 0; remainder--)
x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
x ^= (Py_uhash_t) len;
x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
if (x == -1) {
x = -2;
}
return x;
}
static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
16 * SIZEOF_PY_HASH_T};
#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
According to PEP 456, SipHash (MIT License) is the default string and bytes hash algorithm:
/* byte swap little endian to host endian
* Endian conversion not only ensures that the hash function returns the same
* value on all platforms. It is also required to for a good dispersion of
* the hash values' least significant bits.
*/
#if PY_LITTLE_ENDIAN
# define _le64toh(x) ((uint64_t)(x))
#elif defined(__APPLE__)
# define _le64toh(x) OSSwapLittleToHostInt64(x)
#elif defined(HAVE_LETOH64)
# define _le64toh(x) le64toh(x)
#else
# define _le64toh(x) (((uint64_t)(x) << 56) | \
(((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
(((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
(((uint64_t)(x) << 8) & 0xff00000000ULL) | \
(((uint64_t)(x) >> 8) & 0xff000000ULL) | \
(((uint64_t)(x) >> 24) & 0xff0000ULL) | \
(((uint64_t)(x) >> 40) & 0xff00ULL) | \
((uint64_t)(x) >> 56))
#endif
#ifdef _MSC_VER
# define ROTATE(x, b) _rotl64(x, b)
#else
# define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
#endif
#define HALF_ROUND(a,b,c,d,s,t) \
a += b; c += d; \
b = ROTATE(b, s) ^ a; \
d = ROTATE(d, t) ^ c; \
a = ROTATE(a, 32);
#define DOUBLE_ROUND(v0,v1,v2,v3) \
HALF_ROUND(v0,v1,v2,v3,13,16); \
HALF_ROUND(v2,v1,v0,v3,17,21); \
HALF_ROUND(v0,v1,v2,v3,13,16); \
HALF_ROUND(v2,v1,v0,v3,17,21);
static uint64_t
siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
uint64_t b = (uint64_t)src_sz << 56;
const uint64_t *in = (uint64_t*)src;
uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
uint64_t v3 = k1 ^ 0x7465646279746573ULL;
uint64_t t;
uint8_t *pt;
uint8_t *m;
while (src_sz >= 8) {
uint64_t mi = _le64toh(*in);
in += 1;
src_sz -= 8;
v3 ^= mi;
DOUBLE_ROUND(v0,v1,v2,v3);
v0 ^= mi;
}
t = 0;
pt = (uint8_t *)&t;
m = (uint8_t *)in;
switch (src_sz) {
case 7: pt[6] = m[6]; /* fall through */
case 6: pt[5] = m[5]; /* fall through */
case 5: pt[4] = m[4]; /* fall through */
case 4: memcpy(pt, m, sizeof(uint32_t)); break;
case 3: pt[2] = m[2]; /* fall through */
case 2: pt[1] = m[1]; /* fall through */
case 1: pt[0] = m[0]; /* fall through */
}
b |= _le64toh(t);
v3 ^= b;
DOUBLE_ROUND(v0,v1,v2,v3);
v0 ^= b;
v2 ^= 0xff;
DOUBLE_ROUND(v0,v1,v2,v3);
DOUBLE_ROUND(v0,v1,v2,v3);
/* modified */
t = (v0 ^ v1) ^ (v2 ^ v3);
return t;
}
static Py_hash_t
pysiphash(const void *src, Py_ssize_t src_sz) {
return (Py_hash_t)siphash24(
_le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
src, src_sz);
}
uint64_t
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
{
return siphash24(key, 0, src, src_sz);
}
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
#endif
Objects such as tuples in (in tupleobject.c
) have their own hashing methods. See the source for more examples:
static Py_hash_t
tuplehash(PyTupleObject *v)
{
Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Py_hash_t y;
Py_ssize_t len = Py_SIZE(v);
PyObject **p;
Py_uhash_t mult = _PyHASH_MULTIPLIER;
x = 0x345678UL;
p = v->ob_item;
while (--len >= 0) {
y = PyObject_Hash(*p++);
if (y == -1)
return -1;
x = (x ^ y) * mult;
/* the cast might truncate len; that doesn't change hash stability */
mult += (Py_hash_t)(82520UL + len + len);
}
x += 97531UL;
if (x == (Py_uhash_t)-1)
x = -2;
return x;
}