Hashing is very important for competitive programming. Normally,what we do in hashing is ,We compress the whole structure (string, may be an array) to a single number..(hash_value)..
Okay, All the different structure should have different hash value, To compute hash_value, we need a hash function that is computes a unique hash value for each..
Let assume A={1,2,3,4} , B= {1,2,3,3}, C = {1,2,3,4,5} , D ={1,2,3,4}; Than getHash(A) should be equal to getHash(D) as they are same. Same rules apply for String
Constructing a hash function is very simple
Step 1: Choose A prime as mod ,typically I use P = 1e7 + 7 Step 2: Choose A prime as base value (which need to greater than in entries of the structure) For String I use B = 131 Step 3: Using hornar’s rule , calculate H
Here it is :
This is shown for string, same rule applicable for vector, array
Okay, I use Polynomial representation of vector
Here h = s[0] * B^(n-1) + s[1] * B^(n-2) + S[2] * B^(n-3) ….. + S[n-1] * B^0
long long getHash(string s) { long long h = 0; for(int i = 0 ; s[i]; i ++ ){ h = (h * B + s[i]) % P; } return h; }
This hash_value is stored in a map … or set what you like to use which supports find or count operation…
Now I illustrate a simple problem with hashing
Problem : http://codeforces.com/contest/4/problem/C
Abridge statement: Given N strings, One by one, Process them ,
* If they have not occurred previously ,then print ‘OK’
* Else Print “The string + Previously occurred how many times”
Solution:
1. for each string , We compute its hash value ,Then check if it is already in the map ??
2. If yes, Then print The string + The number of times it occurred previously,
3. Else print NO
#include <bits/stdc++.h> #include <cstring> using namespace std; /*------- Constants---- */ #define LMT 15 #define ll long long #define ull unsigned long long #define mod 1000000007 #define MEMSET_INF 63 #define MEM_VAL 1061109567 #define FOR(i,n) for( int i=0 ; i < n ; i++ ) #define mp(i,j) make_pair(i,j) #define lop(i,a,b) for( int i = (a) ; i < (b) ; i++) #define pb(a) push_back((a)) #define gc getchar #define PI acos(-1.0) #define inf 1<<30 #define lc ((n)<<1) #define rc ((n)<<1 |1) #define GREATER(x, y) ((x) > (y) + EPS) #define GREATER_EQUAL(x, y) ((x) > (y) - EPS) #define LESS(x, y) ((x) < (y) - EPS) #define LESS_EQUAL(x, y) ((x) < (y) + EPS) #define EQUAL(x, y) (abs((x) - (y)) < EPS) #define msg(x) cout << x << endl /* -------Global Variables ------ */ ll euclid_x,euclid_y,d; /*---- short Cuts ------- */ #define ms(ara_name,value) memset(ara_name,value,sizeof(ara_name)) typedef pair<int, int> ii; typedef vector<int > vi ; /*------ template functions ------ */ inline void sc(int &x) { register int c = gc(); x = 0; int neg = 0; for(;((c<48 | c>57) && c != '-');c = gc()); if(c=='-') {neg=1;c=gc();} for(;c>47 && c<58;c = gc()) {x = (x<<1) + (x<<3) + c - 48;} if(neg) x=-x; } template <class T> inline T bigmod(T p,T e,T M){ ll ret = 1; for(; e > 0; e >>= 1){ if(e & 1) ret = (ret * p) % M; p = (p * p) % M; } return (T)ret; } template <class T> inline T gcd(T a,T b){if(b==0)return a;return gcd(b,a%b);} template <class T> inline T modinverse(T a,T M){return bigmod(a,M-2,M);} /*************************** END OF TEMPLATE ****************************/ string str; map<ll , int> kmp; ll B = 131; ll P = 1e7 + 7; long long getHash(string s) { long long h = 0; for(int i = 0 ; s[i]; i ++ ){ h = (h * B + s[i]) % P; } return h; } int main() { int n; sc(n) ; for(int i = 0 ; i < n ; i ++ ) { cin >> str; ll k = getHash(str); if( kmp.count(k) == 0 ) { printf("OK\n"); } else { cout << str ; printf("%d\n",kmp[k]); } kmp[k] ++; } return 0; }
Yaahhh..
This is a wrong code… Why ?? Because My hash function is simple but it is producing some collision. So we need a way out…
1. if two strings have same length, Then the probability of collision is very very small. So we can do , We use a length wise map…
2.Use double hashing
3. Others ,I don’t know
Approach 1:
In the above solution we were using one map, But now we will use an array of map..
How many Map I need? Maximum Length
So This is an accepted code:
#include <bits/stdc++.h> #include <cstring> using namespace std; /*------- Constants---- */ #define LMT 15 #define ll long long #define ull unsigned long long #define mod 1000000007 #define MEMSET_INF 63 #define MEM_VAL 1061109567 #define FOR(i,n) for( int i=0 ; i < n ; i++ ) #define mp(i,j) make_pair(i,j) #define lop(i,a,b) for( int i = (a) ; i < (b) ; i++) #define pb(a) push_back((a)) #define gc getchar #define PI acos(-1.0) #define inf 1<<30 #define lc ((n)<<1) #define rc ((n)<<1 |1) #define GREATER(x, y) ((x) > (y) + EPS) #define GREATER_EQUAL(x, y) ((x) > (y) - EPS) #define LESS(x, y) ((x) < (y) - EPS) #define LESS_EQUAL(x, y) ((x) < (y) + EPS) #define EQUAL(x, y) (abs((x) - (y)) < EPS) #define msg(x) cout << x << endl /* -------Global Variables ------ */ ll euclid_x,euclid_y,d; /*---- short Cuts ------- */ #define ms(ara_name,value) memset(ara_name,value,sizeof(ara_name)) typedef pair<int, int> ii; typedef vector<int > vi ; /*------ template functions ------ */ inline void sc(int &x) { register int c = gc(); x = 0; int neg = 0; for(;((c<48 | c>57) && c != '-');c = gc()); if(c=='-') {neg=1;c=gc();} for(;c>47 && c<58;c = gc()) {x = (x<<1) + (x<<3) + c - 48;} if(neg) x=-x; } template <class T> inline T bigmod(T p,T e,T M){ ll ret = 1; for(; e > 0; e >>= 1){ if(e & 1) ret = (ret * p) % M; p = (p * p) % M; } return (T)ret; } template <class T> inline T gcd(T a,T b){if(b==0)return a;return gcd(b,a%b);} template <class T> inline T modinverse(T a,T M){return bigmod(a,M-2,M);} /*************************** END OF TEMPLATE ****************************/ string str; map<ll , int> kmp[LMT]; ll B = 131; ll P = 1e7 + 7; long long getHash(string s) { long long h = 0; for(int i = 0 ; s[i]; i ++ ){ h = (h * B + s[i]) % P; } return h; } int main() { int n; sc(n) ; for(int i = 0 ; i < n ; i ++ ) { cin >> str; ll k = getHash(str); if( kmp[str.length()].count(k) == 0 ) { printf("OK\n"); } else { cout << str ; printf("%d\n",kmp[str.length()][k]); } kmp[str.length()][k] ++; } return 0; }
Approach 2: Double Hashing
We use Two mod instead of 1.
so now P1 = 1e7+7 , P2 = 1e9+9;
now store pair of values for them,,
Here is the code
#include <bits/stdc++.h> #include <cstring> using namespace std; /*------- Constants---- */ #define LMT 15 #define ll long long #define ull unsigned long long #define mod 1000000007 #define MEMSET_INF 63 #define MEM_VAL 1061109567 #define FOR(i,n) for( int i=0 ; i < n ; i++ ) #define mp(i,j) make_pair(i,j) #define lop(i,a,b) for( int i = (a) ; i < (b) ; i++) #define pb(a) push_back((a)) #define gc getchar #define PI acos(-1.0) #define inf 1<<30 #define lc ((n)<<1) #define rc ((n)<<1 |1) #define GREATER(x, y) ((x) > (y) + EPS) #define GREATER_EQUAL(x, y) ((x) > (y) - EPS) #define LESS(x, y) ((x) < (y) - EPS) #define LESS_EQUAL(x, y) ((x) < (y) + EPS) #define EQUAL(x, y) (abs((x) - (y)) < EPS) #define msg(x) cout << x << endl /* -------Global Variables ------ */ ll euclid_x,euclid_y,d; /*---- short Cuts ------- */ #define ms(ara_name,value) memset(ara_name,value,sizeof(ara_name)) typedef pair<int, int> ii; typedef vector<int > vi ; /*------ template functions ------ */ inline void sc(int &x) { register int c = gc(); x = 0; int neg = 0; for(;((c<48 | c>57) && c != '-');c = gc()); if(c=='-') {neg=1;c=gc();} for(;c>47 && c<58;c = gc()) {x = (x<<1) + (x<<3) + c - 48;} if(neg) x=-x; } template <class T> inline T bigmod(T p,T e,T M){ ll ret = 1; for(; e > 0; e >>= 1){ if(e & 1) ret = (ret * p) % M; p = (p * p) % M; } return (T)ret; } template <class T> inline T gcd(T a,T b){if(b==0)return a;return gcd(b,a%b);} template <class T> inline T modinverse(T a,T M){return bigmod(a,M-2,M);} /*************************** END OF TEMPLATE ****************************/ string str; map<pair<ll, ll>, int > kmp; ll B = 131; ll P1 = 1e7 + 7; ll P2 = 1e9+9; pair<ll , ll> getHash(string s) { long long h1 = 0 , h2 = 0; for(int i = 0 ; s[i]; i ++ ){ h1 = (h1 * B + s[i]) % P1; h2 = (h2 * B + s[i]) % P2; } return mp(h1,h2); } int main() { int n; sc(n) ; for(int i = 0 ; i < n ; i ++ ) { cin >> str; pair<ll, ll> k = getHash(str); if( kmp.count(k) == 0 ) { printf("OK\n"); } else { cout << str ; printf("%d\n",kmp[k]); } kmp[k] ++; } return 0; }
We will discuss some other problem in other day 🙂