映射
- HashTable
class HashTable(object):
_empty = object()
_deleted = object()
def __init__(self, size=11):
self.size = size
self._len = 0
self._keys = [self._empty] * size # keys
self._values = [self._empty] * size # values
def put(self, key, value):
initial_hash = hash_ = self.hash(key)
while True:
if self._keys[hash_] is self._empty or self._keys[hash_] is self._deleted:
# can assign to hash_ index
self._keys[hash_] = key
self._values[hash_] = value
self._len += 1
return
elif self._keys[hash_] == key:
# key already exists here, assign over
self._keys[hash_] = key
self._values[hash_] = value
return
hash_ = self._rehash(hash_)
if initial_hash == hash_:
# table is full
raise ValueError("Table is full")
def get(self, key):
initial_hash = hash_ = self.hash(key)
while True:
if self._keys[hash_] is self._empty:
# That key was never assigned
return None
elif self._keys[hash_] == key:
# key found
return self._values[hash_]
hash_ = self._rehash(hash_)
if initial_hash == hash_:
# table is full and wrapped around
return None
def del_(self, key):
initial_hash = hash_ = self.hash(key)
while True:
if self._keys[hash_] is self._empty:
# That key was never assigned
return None
elif self._keys[hash_] == key:
# key found, assign with deleted sentinel
self._keys[hash_] = self._deleted
self._values[hash_] = self._deleted
self._len -= 1
return
hash_ = self._rehash(hash_)
if initial_hash == hash_:
# table is full and wrapped around
return None
def hash(self, key):
return key % self.size
def _rehash(self, old_hash):
"""
linear probing
"""
return (old_hash + 1) % self.size
def __getitem__(self, key):
return self.get(key)
def __delitem__(self, key):
return self.del_(key)
def __setitem__(self, key, value):
self.put(key, value)
def __len__(self):
return self._len
class ResizableHashTable(HashTable):
MIN_SIZE = 8
def __init__(self):
super().__init__(self.MIN_SIZE)
def put(self, key, value):
super().put(key, value)
# increase size of dict * 2 if filled >= 2/3 size (like python dict)
if len(self) >= (self.size * 2) / 3:
self.__resize()
def __resize(self):
keys, values = self._keys, self._values
self.size *= 2 # this will be the new size
self._len = 0
self._keys = [self._empty] * self.size
self._values = [self._empty] * self.size
for key, value in zip(keys, values):
if key is not self._empty and key is not self._deleted:
self.put(key, value)
if __name__ == "__main__":
s = "hello"
print(hash(s))
- SeparateChainingHashTable
class Node(object):
def __init__(self, key=None, value=None, next=None):
self.key = key
self.value = value
self.next = next
class SeparateChainingHashTable(object):
_empty = None
def __init__(self, size=11):
self.size = size
self._len = 0
self._table = [self._empty] * size
def put(self, key, value):
hash_ = self.hash(key)
node_ = self._table[hash_]
if node_ is self._empty:
self._table[hash_] = Node(key, value)
else:
while node_.next is not None:
if node_.key == key:
node_.value = value
return
node_ = node_.next
node_.next = Node(key, value)
self._len += 1
def get(self, key):
hash_ = self.hash(key)
node_ = self._table[hash_]
while node_ is not self._empty:
if node_.key == key:
return node_.value
node_ = node_.next
return None
def del_(self, key):
hash_ = self.hash(key)
node_ = self._table[hash_]
pre_node = None
while node_ is not None:
if node_.key == key:
if pre_node is None:
self._table[hash_] = node_.next
else:
pre_node.next = node_.next
self._len -= 1
pre_node = node_
node_ = node_.next
def hash(self, key):
return hash(key) % self.size
def __len__(self):
return self._len
def __getitem__(self, key):
return self.get(key)
def __delitem__(self, key):
return self.del_(key)
def __setitem__(self, key, value):
self.put(key, value)
- 最大子序列
思路:记录下两个序列成员连续相等的index,存储最大长度
def max_common_sub_string(s1, s2):
# Assuming s2 has all unique chars
s2dic = {s2[i]: i for i in range(len(s2))}
maxr = 0
subs = ''
i = 0
while i < len(s1):
if s1[i] in s2dic:
j = s2dic[s1[i]]
k = i
while j < len(s2) and k < len(s1) and s1[k] == s2[j]:
k += 1
j += 1
if k - i > maxr:
maxr = k-i
subs = s1[i:k]
i = k
else:
i += 1
return subs
- 最大子成员
和子序列的区别,eg,s1 = "13456778",s2 = "357486782",最大子成员为['3', '5', '7', '7', '8']
思路:需要运用到动态规划的方式,首先初始化一个二维向量,行为s2的长度,列为s1的长度,遍历s1和s2的字符,如果相等,res[i][j] = res[i-1][j-1] + 1,如果不相等,max(res[i-1][j], res[i][j-1]);然后反向查找共有的字符
def longest_common_seq(s1, s2):
len1, len2 = len(s1), len(s2)
res = [[0]*(len2+1) for _ in range((len1+1))]
for j in range(1, len2+1):
for i in range(1, len1+1):
if s1[i-1] == s2[j-1]:
res[i][j] = res[i-1][j-1] + 1
else:
res[i][j] = max(res[i-1][j], res[i][j-1])
out = []
while len1 > 0 and len2 > 0:
if s1[len1-1] == s2[len2-1]:
out.append(s1[len1-1])
len1 -= 1
len2 -= 1
elif res[len1-1][len2] == res[len1][len2-1]:
len2 -= 1
elif res[len1-1][len2] > res[len1][len2-1]:
len1 -= 1
else:
len2 -= 1
out.reverse()
return out
if __name__ == "__main__":
s1 = "13456778"
s2 = "357486782"
assert longest_common_seq(s1, s2) == ['3', '5', '7', '7', '8']
- 布隆过滤器
import mmh3
from bitarray import bitarray
class BloomFilter(object):
def __init__(self, bit_size=5000000, seeds=[11, 12, 13, 14, 15]):
bit_array = bitarray(bit_size)
bit_array.setall(0)
self.bit_array = bit_array
self.seeds = seeds
self.bit_size = bit_size
def add(self, url):
for b in self.get_slots(url):
self.bit_array[b] = 1
def contains(self, url):
slots = self.get_slots(url)
return all([self.bit_array[b] for b in slots])
def hash(self, args):
return mmh3.hash(args[0], args[1]) % self.bit_size
def get_slots(self, url):
return map(self.hash, zip([url] * len(self.seeds), self.seeds))
if __name__ == "__main__":
b = BloomFilter()
url = "https://ne7ermore.github.io/"
print(b.contains(url))
b.add(url)
print(b.contains(url))