Skip to content

Commit 49ad1fc

Browse files
author
Dan Lecocq
committed
std::vector implementation
1 parent a3e3a7b commit 49ad1fc

8 files changed

+151
-625
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ driver
33
test
44
*.dSYM
55
*.cpp
6+
bench
7+
data/*

Makefile

+22-7
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,30 @@
11
CPP = g++
2-
CPPOPTS = -O9 -Wall -Iinclude -I/Users/dan/playground/hat-trie-chris/hat-trie/src
2+
CPPOPTS = -O3 -Wall -Iinclude -I/Users/dan/playground/hat-trie-chris/hat-trie/src
33

44
all: test bench
55

6-
bench: include/*.hpp bench.cpp
7-
$(CPP) $(CPPOPTS) bench.cpp -o bench
6+
# Retrieve some of the test data from Dr. Askitis
7+
data/distinct_1:
8+
# Create a directory
9+
mkdir -p data
10+
# Downloading the dataset
11+
curl http://www.naskitis.com/distinct_1.bz2 -o data/distinct_1.bz2
12+
# Unpacking the data
13+
bunzip2 data/distinct_1.bz2
14+
# Splitting the data up into the various inputs
15+
for i in {1,2,4,6,8,{1,2,3,4,5,6,7,8,9}{0,5}}00 ; do
16+
head -n $i data/distinct_1 > data/distinct_$i
17+
done
18+
for i in {1,2,3,4,5,6,7,8,9}{0,5}000 ; do
19+
head -n $i data/distinct_1 > data/distinct_$i
20+
done
821

9-
clean:
10-
rm -rdf *.o test bench *.dSYM
22+
bench: include/*.hpp bench.cpp data/distinct_1
23+
$(CPP) $(CPPOPTS) bench.cpp -o bench
24+
./bench ./data/distinct_{1,2,4,6,8,{1,2,3,4,5,6,7,8,9}{0,5}}00 ./data/distinct_{1,2,3,4,5,6,7,8,9}{0,5}000
1125

1226
test: include/*.hpp test.cpp
1327
$(CPP) $(CPPOPTS) -Ideps/Catch/single_include test.cpp -o test
14-
# Now invoke
15-
#./test
28+
29+
clean:
30+
rm -rdf *.o test bench *.dSYM

include/align.hpp

-14
This file was deleted.

include/array-hash.hpp

+51-195
Original file line numberDiff line numberDiff line change
@@ -9,24 +9,27 @@
99
#ifndef HT__ARRAY_HASH_H
1010
#define HT__ARRAY_HASH_T
1111

12+
#include <vector>
13+
#include <string>
14+
1215
#include <hash.hpp>
1316

14-
#include <cstdlib>
15-
#include <cstring>
1617
#include <iostream>
1718

1819
namespace ht {
1920
template <
2021
class Value,
21-
class Hash=superfast,
22-
class Allocator=std::allocator<char> >
22+
class Hash=crapwow,
23+
class Allocator=std::allocator<std::pair<std::string,Value> > >
2324
class ArrayHash {
2425
public:
2526
/* Some typedefs */
26-
typedef size_t size_type;
27-
typedef Value value_type;
28-
typedef Hash hash_type;
29-
typedef Allocator allocator_type;
27+
typedef size_t size_type;
28+
typedef std::string key_type;
29+
typedef Value value_type;
30+
typedef std::pair<key_type,value_type> pair_type;
31+
typedef Hash hash_type;
32+
typedef Allocator allocator_type;
3033

3134
/* Share the hash function object across all such classes */
3235
static const hash_type hasher;
@@ -37,15 +40,9 @@ namespace ht {
3740
/* Allocate the number of bins that we'll need. The array of bins
3841
* itself is allocated with new[]; each bin is a std::vector, which
3942
* manages the storage for its own contents */
40-
std::allocator<char*> tmp;
41-
try {
42-
bins = tmp.allocate(num_bins);
43-
for (size_type i = 0; i < num_bins; ++i) {
44-
bins[i] = NULL;
45-
}
46-
} catch(std::bad_alloc e) {
43+
bins = new std::vector<pair_type>[b];
44+
if (bins == NULL) {
4745
std::cout << "Could not allocate" << std::endl;
48-
bins = NULL;
4946
}
5047
}
5148

@@ -54,15 +51,7 @@ namespace ht {
5451
/* Bail out early -- nothing to see here */
5552
return;
5653
}
57-
58-
for (size_type i = 0; i < num_bins; ++i) {
59-
if (bins[i] == NULL) {
60-
continue;
61-
}
62-
/* Now, we have to deallocate the char * we were given */
63-
free(static_cast<void*>(bins[i]));
64-
bins[i] = NULL;
65-
}
54+
delete[] bins;
6655
}
6756

6857
/**
@@ -75,208 +64,75 @@ namespace ht {
7564
*
7665
* If the key doesn't exist, create it, and return a reference to where
7766
* we'd store it */
78-
value_type& operator[](const char* key) {
79-
value_type& ref(get(key, strlen(key)));
80-
return ref;
81-
}
82-
8367
value_type& operator[](const std::string& key) {
84-
return get(key.c_str(), key.length());
85-
}
86-
87-
/* Retrieve only
88-
*
89-
* Supports binary keys by providing a length, too. Missing keys are
90-
* automatically inserted, and a reference to the new item is returned
91-
*/
92-
value_type& get(const char* k, size_type len) {
9368
/* We need to hash the key and scan its bin to see if the key already
9469
* exists. If it does, we'll return a reference to the stored value.
9570
* If it doesn't, we'll append a new pair to the bin. Each bin is a
9671
* vector, so an empty bin needs no separate allocation step */
97-
size_type position = hasher(k, len) % num_bins;
98-
char* bin = bins[position];
99-
if (bin == NULL) {
100-
/* Allocate some space, set, return early. We need enough for a
101-
* size_type for how many items are in the list, enough for the
102-
* key, and then enough for a copy of the value */
103-
size_type new_len = sizeof(size_type) + sizeof(size_type) +
104-
aligned(len) + aligned(sizeof(value_type));
105-
bin = bins[position] = static_cast<char*>(malloc(new_len));
106-
//std::cout << " Allocating " << new_len << " for " << position << " into " << static_cast<void*>(bin) << std::endl;
107-
/* Now, we'll advance bin as we add more to it. First, set the
108-
* count to be just 1 */
109-
*(reinterpret_cast<size_type*>(bin)) = 1;
110-
bin += sizeof(size_type);
111-
return set_record(bin, k, len);
112-
}
113-
114-
/* If we've gotten this far, the bin existed -- we need to do a
115-
* linear scan to see if it's already in here. First, let's see how
116-
* many items are actually in this array */
117-
if (find_in_bin(bin, k, len)) {
118-
//std::cout << "Found record in bin" << std::endl;
119-
return get_value(bin);
72+
size_type position = hasher(key.c_str(), key.length()) % num_bins;
73+
std::vector<pair_type>& bin(bins[position]);
74+
/* I _believe_ there is a more C++-y way of doing this, but it's
75+
* late, and I'm not entirely sure */
76+
typename std::vector<pair_type>::iterator it(bin.begin());
77+
for (; it != bin.end(); ++it) {
78+
if (it->first == key) {
79+
/* Return a reference to the value */
80+
return it->second;
81+
}
12082
}
121-
122-
/* If we've gotten this far, the key doesn't exist in its table, in
123-
* which case we'll have to append it to the end of the character
124-
* array that we created. This will involve a reallocation.
125-
*
126-
* Apparently the std::allocator doesn't actually use the hint
127-
* provided by `allocate`, but we'll give it a shot anyways. We may
128-
* eventually want to use this with an allocator that is aware of
129-
* realloc
130-
*
131-
* In order to determine how many items we need to declare, we have
132-
* to compare to the original pointer, and then add how much space
133-
* we'll need for this appended item.
134-
*/
135-
//std::cout << "New record in existing bin" << std::endl;
136-
size_type old_len = bin - bins[position];
137-
size_type new_len = old_len + sizeof(size_type) + aligned(len)
138-
+ aligned(sizeof(value_type));
139-
/* We'll make use of hint */
140-
//std::cout << "Deallocating " << old_len << " for " << position << " out of " << static_cast<void*>(bins[position]) << std::endl;
141-
//std::cout << " Allocating " << new_len << " for " << position << " into " << static_cast<void*>(bin) << std::endl;
142-
bin = static_cast<char*>(realloc(bins[position], new_len));
143-
// if (bin != bins[position]) {
144-
// //std::cout << "Realloc failed! " << std::endl;
145-
// /* Now, we'll copy, deallocate, swap */
146-
// // memcpy(bin, bins[position], old_len);
147-
// // free(static_cast<void*>(bins[position]));
148-
// } else {
149-
// //std::cout << "Realloc to the rescue!" << std::endl;
150-
// }
151-
//std::cout << "New position: " << reinterpret_cast<void*>(bin) << " old: " << reinterpret_cast<void*>(bins[position]) << std::endl;
152-
bins[position] = bin;
153-
/* Now increment the count by one, and then add the new value */
154-
++(*bin);
155-
bin += old_len;
156-
return set_record(bin, k, len);
83+
84+
/* If we didn't find one, we should add one */
85+
bin.push_back(make_pair(key, value_type()));
86+
return bin.back().second;
15787
}
15888

15989
/* Insert
16090
*
16191
* This interface is provided for continuity. Binary keys are handled
16292
* through std::string, whose length() is passed to the hash function
16393
*/
164-
value_type& insert(const char* key, const value_type& value) {
165-
return insert(key, strlen(key), value);
166-
}
167-
168-
value_type& insert(const char* k, size_type len, const value_type& v) {
169-
return (get(k, len) = v);
170-
}
171-
172-
value_type& insert(const std::string& key, const value_type& value) {
173-
return insert(key.c_str(), key.length(), value);
94+
value_type& insert(const std::string& key, const value_type& v) {
95+
value_type& ref(operator[](key));
96+
ref = v;
97+
return ref;
17498
}
17599

176100
/* Remove
177101
*
178102
* If the provided key exists, then it is removed. If the key does not
179103
* exist, it has no effect. Normally, it would return a reference, but
180104
* since it's possible the key doesn't exist, we can't. */
181-
void remove(const char* k) {
182-
remove(k, strlen(k));
183-
}
184-
185-
void remove(const char* k, size_type len) {
186-
size_type position = hasher(k, len) % num_bins;
187-
char* bin = bins[position];
188-
if (bin == NULL) { return; }
189-
190-
if (!find_in_bin(bin, k, len)) { return; }
191-
192-
/* Otherwise, we've got just a little bit of work to do. First
193-
* things first, we need to decrement the number of items we have
194-
* in the bin */
195-
//--(*reinterpret_cast<size_type*>(bin));
196-
}
197-
198105
void remove(const std::string& key) {
199-
remove(key.c_str(), key.length());
106+
size_type position = hasher(key.c_str(), key.length()) % num_bins;
107+
std::vector<pair_type>& bin(bins[position]);
108+
typename std::vector<pair_type>::iterator it(bin.begin());
109+
for (; it != bin.end(); ++it) {
110+
if (it->first == key) {
111+
bin.erase(it, it);
112+
return;
113+
}
114+
}
200115
}
201116

202117
/* Existence
203118
*
204119
* Returns true if the provided key exists, else, false */
205-
bool exists(const char* k) {
206-
return exists(k, strlen(k));
207-
}
208-
209-
bool exists(const char* k, size_type len) {
210-
size_type position = hasher(k, len) % num_bins;
211-
char* bin = bins[position];
212-
if (bin == NULL) { return false; }
213-
214-
return find_in_bin(bin, k, len);
215-
}
216-
217120
bool exists(const std::string& key) {
218-
return exists(key.c_str(), key.length());
219-
}
220-
221-
private:
222-
/* How many bins are we using? */
223-
size_type num_bins;
224-
/* We need an array of char*'s */
225-
char ** bins;
226-
227-
/* This is for byte alignment purposes. Turns out we have to align
228-
* things well */
229-
size_type aligned(size_type len, size_type multiple=8) {
230-
size_t remainder = len % multiple;
231-
if (remainder) {
232-
return len + multiple - remainder;
233-
}
234-
return len;
235-
}
236-
237-
/* Return a reference to the value at the provided record as returned by
238-
* find_in_bin */
239-
value_type& get_value(char* record) {
240-
size_type *len = reinterpret_cast<size_type* >(record);
241-
value_type* cpy = reinterpret_cast<value_type*>(record +
242-
sizeof(size_type) + aligned(*len));
243-
return *cpy;
244-
}
245-
246-
/* Fill in a record starting at the provided pointer, and return a
247-
* reference to the value_type stored in it */
248-
value_type& set_record(char* r, const char* k, size_type len) {
249-
/* Now, set the length of the string key that we're inserting */
250-
*(reinterpret_cast<size_type*>(r)) = len;
251-
r += sizeof(size_type);
252-
/* Now, copy the string into the new memory */
253-
memcpy(reinterpret_cast<void*>(r),
254-
reinterpret_cast<const void*>(k), len);
255-
r += aligned(len);
256-
/* Lastly, we have have to create a new value type */
257-
value_type* cpy = new (r) value_type();
258-
return *cpy;
259-
}
260-
261-
/* Find the start of a record in the provided bin. Returns NULL if not
262-
* found and returns a pointer starting where the length is encoded */
263-
bool find_in_bin(char*& bin, const char* k, size_type len) {
264-
size_type* count = reinterpret_cast<size_type*>(bin);
265-
/* Let's advance the pointer as we're moving along */
266-
bin += sizeof(size_type);
267-
for (size_type i = 0; i < *count; ++i) {
268-
/* If the lengths of the two keys aren't equal, then they can't
269-
* be equal */
270-
size_type* l = reinterpret_cast<size_type*>(bin);
271-
if (*l == len && !strncmp(bin + sizeof(size_type), k, len)) {
121+
size_type position = hasher(key.c_str(), key.length()) % num_bins;
122+
std::vector<pair_type>& bin(bins[position]);
123+
typename std::vector<pair_type>::iterator it(bin.begin());
124+
for (; it != bin.end(); ++it) {
125+
if (it->first == key) {
272126
return true;
273127
}
274-
/* Advance the pointer to just past this item */
275-
bin += (sizeof(size_type) + aligned(*l) +
276-
aligned(sizeof(value_type)));
277128
}
278129
return false;
279130
}
131+
private:
132+
/* How many bins are we using? */
133+
size_type num_bins;
134+
/* We need an array of bins, each a vector of (key, value) pairs */
135+
std::vector<pair_type>* bins;
280136
};
281137
}
282138

0 commit comments

Comments
 (0)