I have a huge graph in edge-list format with strings as node labels. I wonder what the "best" way to map the strings to integers is. The input file looks like this example:
Mike Andrew
Mike Jane
John Jane
The output (i.e., a mapped file) should be:
1 2
1 3
4 3
Pasted below is a skeleton in C that reads the input file. Could somebody please advise me on how to proceed?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Count the edges in a whitespace-separated edge-list file.
 *
 * Each edge is a pair of node labels, e.g. "Mike Andrew". Labels are read
 * with a bounded field width so that a long label can never overflow the
 * buffers; a label longer than MAX_LABEL_LEN characters is split by fscanf
 * into several tokens, so keep the limit above the longest expected label.
 *
 * Returns the number of label pairs read, or -1 if the file cannot be
 * opened (the reason is reported on stderr via perror).
 */
int LoadFile(const char * filename) {
    enum { MAX_LABEL_LEN = 255 };
    char node1[MAX_LABEL_LEN + 1];
    char node2[MAX_LABEL_LEN + 1];
    int idx = 0;

    FILE *fp = fopen(filename, "r");
    if (fp == NULL) {
        perror("Error");
        return -1;
    }

    /* The %255s widths must stay in sync with MAX_LABEL_LEN. */
    while (fscanf(fp, "%255s %255s", node1, node2) == 2) {
        idx++;
    }

    fclose(fp);
    return idx;
}
/* Entry point: count the edges stored in ./test.txt and print the total. */
int main(void) {
    const char *input_path = "./test.txt";
    const int edge_count = LoadFile(input_path);

    printf("Number of edges: %d\n", edge_count);
    return 0;
}
You need a naive implementation of a map (one that maps strings to integers).
Define a structure as below to store the strings.
/* Naive string-to-integer map: a flat array of the distinct strings seen so far. */
typedef struct {
/* Number of strings currently stored in map. */
unsigned int hashed;
/* Array of string pointers; a string's position in it determines its index. */
char **map;
} hash;
Define a function that inserts the string into the map if it does not already exist, and returns the index of the string in the map.
/* Returns the 1-based index of entry in map, inserting it first if it is not already present. */
int insertInMap(hash *map, char *entry)
Store the returned index in the edge structure.
/* Translate both endpoint labels of edge i into their integer indices. */
edges[i].first = insertInMap(&map, first_string);
edges[i].second = insertInMap(&map, second_string);
Example code:
/* One edge of the graph, stored as the integer indices of its two endpoints. */
typedef struct {
/* Index that insertInMap returned for the first label of the pair. */
unsigned int first;
/* Index that insertInMap returned for the second label of the pair. */
unsigned int second;
} edge;
/* Naive string-to-integer map: a flat array of the distinct strings seen so far. */
typedef struct {
    unsigned int hashed;   /* number of strings currently stored in map */
    char **map;            /* caller-allocated array holding owned string copies */
} hash;

/*
 * Look up entry in map, appending a private copy of it if it is not present.
 *
 * The search is linear, so every call performs up to map->hashed string
 * comparisons. The struct carries no capacity, so no bounds check is
 * possible here: the caller must guarantee that map->map has room for one
 * more pointer before calling.
 *
 * Returns the 1-based index of entry, or 0 if the copy could not be
 * allocated (the map is left unchanged in that case; 0 is never a valid
 * index).
 */
int insertInMap(hash *map, char *entry)
{
    for (unsigned int i = 0; i < map->hashed; i++) {
        if (strcmp(map->map[i], entry) == 0) {
            return (int)(i + 1);
        }
    }

    /* malloc + memcpy rather than strdup: strdup is not part of ISO C before C23. */
    size_t size = strlen(entry) + 1;
    char *copy = malloc(size);
    if (copy == NULL) {
        return 0;
    }
    memcpy(copy, entry, size);

    map->map[map->hashed++] = copy;
    return (int)map->hashed;
}
/*
 * Read a whitespace-separated edge list and translate every node label into
 * the integer index that insertInMap assigns to it.
 *
 * Returns a malloc'd array of edges that the caller must free(). The array
 * is terminated by a sentinel edge whose first and second are both 0 (0 is
 * never a valid index), which lets the caller determine its length.
 * Returns NULL if the file cannot be opened or memory runs out.
 *
 * Labels are read with a bounded field width; a label longer than
 * MAX_LABEL_LEN characters is split by fscanf into several tokens.
 */
edge *LoadFile(const char * filename) {
    enum { MAX_LABEL_LEN = 255 };
    char node1[MAX_LABEL_LEN + 1];
    char node2[MAX_LABEL_LEN + 1];
    size_t idx = 0;
    size_t numEdges = 10;
    size_t mapCap = 2 * numEdges;
    edge *edges = NULL;
    hash map = { 0, NULL };
    int ok = 0;

    FILE *fp = fopen(filename, "r");
    if (fp == NULL) {
        perror("Error");
        return NULL;
    }

    edges = malloc(numEdges * sizeof *edges);
    map.map = malloc(mapCap * sizeof *map.map);
    if (edges == NULL || map.map == NULL) {
        goto done;
    }

    /* The %255s widths must stay in sync with MAX_LABEL_LEN. */
    while (fscanf(fp, "%255s %255s", node1, node2) == 2) {
        /* Always keep one spare slot for the terminating sentinel. */
        if (idx + 1 >= numEdges) {
            edge *grown = realloc(edges, 2 * numEdges * sizeof *edges);
            if (grown == NULL) {
                goto done;
            }
            edges = grown;
            numEdges *= 2;
        }
        /*
         * One edge can introduce two new labels, so the label array is
         * sized independently of the edge array.
         */
        if ((size_t)map.hashed + 2 > mapCap) {
            char **grown = realloc(map.map, 2 * mapCap * sizeof *map.map);
            if (grown == NULL) {
                goto done;
            }
            map.map = grown;
            mapCap *= 2;
        }

        int first = insertInMap(&map, node1);
        int second = insertInMap(&map, node2);
        if (first <= 0 || second <= 0) {
            /* Valid indices start at 1; anything else is a failed insert. */
            goto done;
        }
        edges[idx].first = (unsigned int)first;
        edges[idx].second = (unsigned int)second;
        idx++;
    }

    edges[idx].first = 0;
    edges[idx].second = 0;
    ok = 1;

done:
    /* The label map is local to this function, so release it before returning. */
    for (unsigned int i = 0; i < map.hashed; i++) {
        free(map.map[i]);
    }
    free(map.map);
    fclose(fp);
    if (!ok) {
        free(edges);
        edges = NULL;
    }
    return edges;
}
Later, print the edges.
I would suggest that you use a Trie data structure. It is designed to store words and associate them a value.
The advantages of a trie over a hashmap are the following:
- Lookup for an element is faster
- No collisions
- Easy ways to traverse the trie or return all values in alphabetical order
- Straight-forward implementation (no hash function, no linked lists ...). It's a simple tree.
The memory use is generally lower in a trie than in a hash table, but in the worst case it will use more memory.
An even more efficient data structure for this purpose is the DAWG (or deterministic acyclic finite state automaton), but its construction is much more complex, so if you do not have millions of nodes in your graph I'd suggest you stick to the Trie.
A possible implementation in C would be the following:
Data structure:
#include <stdlib.h>
#include <stdio.h>
#define ALPHABET_SIZE 26
#define IMPOSSIBLE_VALUE -42
/* One node of the trie; following child links from the root spells out a key. */
typedef struct TrieNode_struct {
/* Child node for each character index, or NULL where no child exists. */
struct TrieNode_struct *children[ALPHABET_SIZE];
/* Value stored for the key that ends at this node, or IMPOSSIBLE_VALUE if none. */
int value;
} TrieNode_t;
/* A trie is handled through a pointer to its root node. */
typedef TrieNode_t *Trie_t;
/*
 * Allocate an empty trie node: every child is NULL and the value is
 * IMPOSSIBLE_VALUE.
 *
 * Returns the new node, which the caller owns, or NULL if allocation fails.
 */
TrieNode_t *new_node(void) {
    TrieNode_t *node = malloc(sizeof *node);
    if (node == NULL) {
        return NULL;
    }

    node->value = IMPOSSIBLE_VALUE;
    /* Explicit loop rather than calloc: all-bits-zero is not guaranteed to be a null pointer. */
    for (int i = 0; i < ALPHABET_SIZE; i++) {
        node->children[i] = NULL;
    }
    return node;
}
/*
 * Map a lowercase letter to its child slot in a trie node.
 *
 * Returns 0 for 'a' up to 25 for 'z'. Any other character (uppercase
 * letters, digits, punctuation, ...) yields -1 so that a caller can reject
 * it instead of computing an index outside the children array.
 */
int char_to_idx(char c){
    if (c < 'a' || c > 'z') {
        return -1;
    }
    return c - 'a';
}
Insert a string/value pair into the trie:
/*
 * Store val under the key made of the characters of str from position depth
 * onwards, walking down from node and creating missing nodes on the way.
 *
 * Nothing is stored when node or str is NULL, when the key contains a
 * character whose char_to_idx result is not a valid child slot, or when
 * new_node returns NULL. The function cannot report this, so a caller that
 * needs to know should look the key up afterwards.
 */
void trie_insert_rec(TrieNode_t *node, char *str, int val, int depth) {
    /* Derived from the array itself so the bound can never drift from the struct. */
    const int slots = (int)(sizeof node->children / sizeof node->children[0]);

    if (node == NULL || str == NULL) {
        return;
    }

    /* Validate the whole key first so a bad character leaves no half-built branch. */
    for (int i = depth; str[i] != '\0'; i++) {
        const int idx = char_to_idx(str[i]);
        if (idx < 0 || idx >= slots) {
            return;
        }
    }

    /* Iterative descent: stack use stays constant however long the key is. */
    for (int i = depth; str[i] != '\0'; i++) {
        const int idx = char_to_idx(str[i]);
        if (node->children[idx] == NULL) {
            node->children[idx] = new_node();
            if (node->children[idx] == NULL) {
                return;
            }
        }
        node = node->children[idx];
    }
    node->value = val;
}
/* Store val under the key str, starting the walk at the first character and the root of trie. */
void trie_insert(Trie_t trie, char *str, int val) {
    const int first_char = 0;

    trie_insert_rec(trie, str, val, first_char);
}
Search for a value in the trie:
/*
 * Look up the key made of the characters of str from position depth
 * onwards, walking down from node.
 *
 * Returns the value stored at the node where the key ends. Returns
 * IMPOSSIBLE_VALUE when node or str is NULL, when the path does not exist,
 * or when the key contains a character whose char_to_idx result is not a
 * valid child slot.
 */
int trie_fetch_rec(TrieNode_t *node, char *str, int depth) {
    /* Derived from the array itself so the bound can never drift from the struct. */
    const int slots = (int)(sizeof node->children / sizeof node->children[0]);

    if (str == NULL) {
        return IMPOSSIBLE_VALUE;
    }

    for (int i = depth; node != NULL && str[i] != '\0'; i++) {
        const int idx = char_to_idx(str[i]);
        if (idx < 0 || idx >= slots) {
            return IMPOSSIBLE_VALUE;
        }
        node = node->children[idx];
    }

    return (node != NULL) ? node->value : IMPOSSIBLE_VALUE;
}
/* Look up the key str, starting at its first character, in the trie rooted at node. */
int trie_fetch(TrieNode_t *node, char *str){
    const int first_char = 0;
    const int found = trie_fetch_rec(node, str, first_char);

    return found;
}
Tiny toy-test
int main() {
Trie_t trie = new_node();
char str[5] = "john\0";
trie_insert(trie, str, 11);
printf("%d\n", trie_fetch(trie, str));
}