Thursday, August 25, 2011

Code Snippet: Murmur hash inverse / pre-image

Today, some caring by sharing. I needed this non-trivial code snippet today and couldn't find it anywhere on the internet, so here it is for future reference. It computes the inverse / pre-image of a Murmur hash, i.e., given a 32-bit Murmur hash value, it computes a 32-bit value that, when hashed, produces that hash value:

/// Inverts a (h ^= h >> s) operation with 8 <= s <= 16
unsigned int invert_shift_xor(unsigned int hs, unsigned int s)
{
	XENSURE(s >= 8 && s <= 16);
	unsigned hs0 = hs >> 24;
	unsigned hs1 = (hs >> 16) & 0xff;
	unsigned hs2 = (hs >> 8) & 0xff;
	unsigned hs3 = hs & 0xff;

	unsigned h0 = hs0;
	unsigned h1 = hs1 ^ (h0 >> (s-8));
	unsigned h2 = (hs2 ^ (h0 << (16-s)) ^ (h1 >> (s-8))) & 0xff;
	unsigned h3 = (hs3 ^ (h1 << (16-s)) ^ (h2 >> (s-8))) & 0xff;
	return (h0<<24) + (h1<<16) + (h2<<8) + h3;
}

unsigned int murmur_hash_inverse(unsigned int h, unsigned int seed)
{
	const unsigned int m = 0x5bd1e995;
	const unsigned int minv = 0xe59b19bd;	// Multiplicative inverse of m under % 2^32
	const int r = 24;

	h = invert_shift_xor(h,15);
	h *= minv;
	h = invert_shift_xor(h,13);

	unsigned int hforward = seed ^ 4;
	hforward *= m;
	unsigned int k = hforward ^ h;
	k *= minv;
	k ^= k >> r;
	k *= minv;

	#ifdef PLATFORM_BIG_ENDIAN
		unsigned char *data = (unsigned char *)&k;
		k = (data[0]) + (data[1] << 8) + (data[2] << 16) + (data[3] << 24);
	#endif

	return k;
}
And for reference, here is the full code, with both the regular murmur hash and the inverses for 32- and 64-bit hashes:
unsigned int murmur_hash ( const void * key, int len, unsigned int seed )
{
	// 'm' and 'r' are mixing constants generated offline.
	// They're not really 'magic', they just happen to work well.

	const unsigned int m = 0x5bd1e995;
	const int r = 24;

	// Initialize the hash to a 'random' value

	unsigned int h = seed ^ len;

	// Mix 4 bytes at a time into the hash

	const unsigned char * data = (const unsigned char *)key;

	while(len >= 4)
	{
		#ifdef PLATFORM_BIG_ENDIAN
			unsigned int k = (data[0]) + (data[1] << 8) + (data[2] << 16) + (data[3] << 24);
		#else
			unsigned int k = *(unsigned int *)data;
		#endif

		k *= m;
		k ^= k >> r;
		k *= m;

		h *= m;
		h ^= k;

		data += 4;
		len -= 4;
	}

	// Handle the last few bytes of the input array

	switch(len)
	{
	case 3: h ^= data[2] << 16;
	case 2: h ^= data[1] << 8;
	case 1: h ^= data[0];
		h *= m;
	};

	// Do a few final mixes of the hash to ensure the last few
	// bytes are well-incorporated.

	h ^= h >> 13;
	h *= m;
	h ^= h >> 15;

	return h;
}

/// Inverts a (h ^= h >> s) operation with 8 <= s <= 16
unsigned int invert_shift_xor(unsigned int hs, unsigned int s)
{
	XENSURE(s >= 8 && s <= 16);
	unsigned hs0 = hs >> 24;
	unsigned hs1 = (hs >> 16) & 0xff;
	unsigned hs2 = (hs >> 8) & 0xff;
	unsigned hs3 = hs & 0xff;

	unsigned h0 = hs0;
	unsigned h1 = hs1 ^ (h0 >> (s-8));
	unsigned h2 = (hs2 ^ (h0 << (16-s)) ^ (h1 >> (s-8))) & 0xff;
	unsigned h3 = (hs3 ^ (h1 << (16-s)) ^ (h2 >> (s-8))) & 0xff;
	return (h0<<24) + (h1<<16) + (h2<<8) + h3;
}

unsigned int murmur_hash_inverse(unsigned int h, unsigned int seed)
{
	const unsigned int m = 0x5bd1e995;
	const unsigned int minv = 0xe59b19bd;	// Multiplicative inverse of m under % 2^32
	const int r = 24;

	h = invert_shift_xor(h,15);
	h *= minv;
	h = invert_shift_xor(h,13);

	unsigned int hforward = seed ^ 4;
	hforward *= m;
	unsigned int k = hforward ^ h;
	k *= minv;
	k ^= k >> r;
	k *= minv;

	#ifdef PLATFORM_BIG_ENDIAN
		unsigned char *data = (unsigned char *)&k;
		k = (data[0]) + (data[1] << 8) + (data[2] << 16) + (data[3] << 24);
	#endif

	return k;
}

uint64 murmur_hash_64(const void * key, int len, uint64 seed)
{
	const uint64 m = 0xc6a4a7935bd1e995ULL;
	const int r = 47;

	uint64 h = seed ^ (len * m);

	const uint64 * data = (const uint64 *)key;
	const uint64 * end = data + (len/8);

	while(data != end)
	{
		#ifdef PLATFORM_BIG_ENDIAN
			uint64 k = *data++;
			char *p = (char *)&k;
			char c;
			c = p[0]; p[0] = p[7]; p[7] = c;
			c = p[1]; p[1] = p[6]; p[6] = c;
			c = p[2]; p[2] = p[5]; p[5] = c;
			c = p[3]; p[3] = p[4]; p[4] = c;
		#else
			uint64 k = *data++;
		#endif

		k *= m;
		k ^= k >> r;
		k *= m;
		
		h ^= k;
		h *= m;
	}

	const unsigned char * data2 = (const unsigned char*)data;

	switch(len & 7)
	{
	case 7: h ^= uint64(data2[6]) << 48;
	case 6: h ^= uint64(data2[5]) << 40;
	case 5: h ^= uint64(data2[4]) << 32;
	case 4: h ^= uint64(data2[3]) << 24;
	case 3: h ^= uint64(data2[2]) << 16;
	case 2: h ^= uint64(data2[1]) << 8;
	case 1: h ^= uint64(data2[0]);
			h *= m;
	};
 
	h ^= h >> r;
	h *= m;
	h ^= h >> r;

	return h;
}

uint64 murmur_hash_64_inverse(uint64 h, uint64 seed)
{
	const uint64 m = 0xc6a4a7935bd1e995ULL;
	const uint64 minv = 0x5f7a0ea7e59b19bdULL; // Multiplicative inverse of m under % 2^64
	const int r = 47;

	h ^= h >> r;
	h *= minv;
	h ^= h >> r;
	h *= minv;

	uint64 hforward = seed ^ (8 * m);
	uint64 k = h ^ hforward;

	k *= minv;
	k ^= k >> r;
	k *= minv;

	#ifdef PLATFORM_BIG_ENDIAN
		char *p = (char *)&k;
		char c;
		c = p[0]; p[0] = p[7]; p[7] = c;
		c = p[1]; p[1] = p[6]; p[6] = c;
		c = p[2]; p[2] = p[5]; p[5] = c;
		c = p[3]; p[3] = p[4]; p[4] = c;
	#endif
	
	return k;
}
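
As a quick sanity check (a minimal sketch, assuming the functions above are in scope, asserts are enabled and the build is little-endian), hashing a 32-bit value and then inverting the hash should give back the original value:

#include <cassert>

int main()
{
	const unsigned int seed = 0x12345678;   // any seed works
	unsigned int value = 0xdeadbeef;        // the value we want to recover

	unsigned int h = murmur_hash(&value, 4, seed);
	unsigned int pre = murmur_hash_inverse(h, seed);

	// The pre-image must hash back to the same hash value...
	assert(murmur_hash(&pre, 4, seed) == h);

	// ...and since the 4-byte hash is a bijection for a fixed seed, the
	// pre-image is the original value itself.
	assert(pre == value);
	return 0;
}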

Wednesday, August 24, 2011

An idea for better watch windows

Watch windows suck. I’ve spent a large part of my career looking at them (that’s how those bugs get fixed) and it’s often a frustrating experience.


Visual Studio’s watch window is one of the better ones, but it still has many issues that make the debugging experience a lot less pleasant than it could be.

  • Custom data types such as MyTree, MyHashSet and MyLinkedList are difficult to look at. To get to the content you have to understand the internal data layout and expand the links by hand.
  • I like to pack my resource data into tight static blobs -- file formats for memory. A simple such blob might have a header with a variable number of offsets into a buffer of tightly packed strings. Such memory layouts cannot be described with just C structs and the watch window can’t inspect them. You have to cast pointers by hand or use the Memory view.


I don’t even see the code. All I see is a hermite curve fitted, time key sorted, zlib compressed reload animation.

  • If I have an array with 10 000 floats and one of them is a #NaN, I have no way of finding out except to expand it and scroll through the numbers until I find the bad one.
  • The watch window can’t do reverse lookup of string hashes, so when I see a hash value in the data I have no idea what it refers to.

Yes, I know that some of these things can be fixed. I know that you can get the Visual Studio Debugger to understand your own data types by editing autoexp.dat. And since I’ve done that for all our major collection types (Vector, Deque, Map, SortMap, HashMap, Set, SortSet, HashSet, ConstConfigValue and DynamicConfigValue) I know what a pain it is, and I know I don’t want to do it any more. Also, it doesn’t help the debuggers for the other platforms.

I also know that you can do some tricks with Visual Studio extensions. At my previous company we had reverse hash lookup through a Visual Studio extension. That was also painful to write, and a single platform solution.

So yes, you can fix some of these things and make your work environment a little better. But I think we should aim higher.

Consider this: The variable watcher has access to the entire game memory and plenty of time to analyze it. (Variable watching is not a time critical task.)

Imagine what a well written C program that knew the layout of all your data structures could do with that information. It could expand binary trees and display them in a nice view, reverse lookup your hashes, highlight uninitialized 0xdeadbeef variables, spell check your strings, etc.

The idea


So this is my idea: instead of writing plug-ins and extensions for all the IDEs and platforms in the world, we write the watcher as a separate external program. The user starts the program, connects to a process, enters a memory address and a variable type and gets presented with a nice view of the data:


The connection backend would be customizable so that we could use it both for local processes and remote devices (Xbox/PS3). The front end sends an (address, size) request and the backend replies with a bunch of data. So the platform doesn’t matter. As long as there is some way of accessing the memory of the device we can connect it to the watcher.

We can even use it to look at file contents. All we need is a backend that can return data from different offsets in the file. This works especially well for data blobs, where the file and memory formats are identical. The watcher would function as a general data viewer that could be used for both files and memory.
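
A minimal sketch of what that backend interface might look like (all names here are hypothetical):

class MemoryBackend
{
public:
	virtual ~MemoryBackend() {}

	// Reads `size` bytes starting at `address` into `out`.
	// Returns false if the range cannot be read.
	virtual bool read(unsigned long long address, unsigned size, void *out) = 0;
};

// Possible implementations: a local process backend (OpenProcess / ptrace),
// a remote device backend (target manager or TCP connection) and a file
// backend (offsets into the file).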

For this to work, we need a way to describe our data structures to the program. It should understand regular C structs, of course, but we also need some way of describing more complex data, such as variable length objects, offsets, choices, etc. Essentially, what we need is a generic way to describe blobs of structured data, no matter what the format and layout.

I’m not sure what such a description language might look like (or if one already exists), but it might be something loosely based on C structs and then extended to cover more cases. Perhaps something like:

struct Data
{
	zero_terminated char[] name;
	pad_to_4_bytes_alignment;
	platform_endian unsigned count;
	Entry entries[count];
};

The program also needs an extension mechanism so that we can write custom code for processing objects that can’t be described using even this more advanced syntax. This could be used for things like reverse hash lookups, or other queries that depend on external data.

Going further the program could be extended with more visualizers that could allow you to view and edit complex objects in lots of interesting ways:


I think this could be a really useful tool, both for debugging and for inspecting files (as a sort of beefed up hex editor). All I need is some time to write it.

What do you think?

Tuesday, August 9, 2011

Fixing memory issues in Lua

Garbage collection can be both a blessing and a curse. On the one hand, it frees you from manually managing memory. This saves development time, reduces bugs, and avoids tricky decisions about object ownership and lifetimes.

On the other hand, when you do run into memory issues (and you most likely will), they can be a lot harder to diagnose and fix, because you don't have detailed control over how memory is allocated and freed.

In this post I'll show some techniques that you can use to address memory issues in Lua (and by extension, in other garbage collected languages).

All Lua memory issues essentially boil down to one of two things:

  • Lua uses too much memory. On consoles memory is a precious resource and sometimes Lua is just using too much of it. The root cause can either be memory leaks or badly constructed/bloated data structures.
  • Garbage collection is taking too long. Too much garbage collection is (not surprisingly) caused by having too much garbage. The code must be rewritten so that it generates less garbage.

Let's look at each issue in turn and see how we can address it.

1. Lua uses too much memory


The first step towards plugging leaks and reducing memory use is to find out where the memory is going. Once we know that, the problems are usually quite easy to fix.

So how do we find out where the memory is going? One way would be to add tracing code to the lua_Alloc() function, but actually there is a much simpler method that doesn't require any C code and is more in line with Lua's dynamic nature. We can just use Lua to count all the objects in the runtime image:

function count_all(f)
	local seen = {}
	local count_table
	count_table = function(t)
		if seen[t] then return end
		f(t)
		seen[t] = true
		for k,v in pairs(t) do
			if type(v) == "table" then
				count_table(v)
			elseif type(v) == "userdata" then
				f(v)
			end
		end
	end
	count_table(_G)
end

Here we just start with the global table _G and recursively enumerate all subtables and userdata. For each object that we haven't seen before, we call the enumeration function f. This will enumerate all the objects in the Lua runtime that can be reached from _G. Depending on how you use Lua you may also want to add some code for enumerating objects stored in the registry, and recurse over metatables and function upvalues to make sure that you really count all the objects in the runtime.

Once you have a function for enumerating all your Lua objects, there are lots of useful things you can do. When it comes to plugging leaks and reducing memory usage I find one of the most useful things is to count the number of objects of each type:

function type_count()
	local counts = {}
	local enumerate = function (o)
		local t = type_name(o)
		counts[t] = (counts[t] or 0) + 1
	end
	count_all(enumerate)
	return counts
end

Here type_name() is a function that returns the name of an object's type. This function will depend on what kind of class/object system you use in your Lua runtime. One common approach is to have global class objects that also act as metatables for objects:

-- A class
Car = {}
Car.__index = Car

-- A method
function Car.honk(self)
	print "toot"
end

-- An object
local my_car = {}
setmetatable(my_car, Car)

In this case, the type_name() function could look something like this:

global_type_table = nil
function type_name(o)
	if global_type_table == nil then
		global_type_table = {}
		for k,v in pairs(_G) do
			global_type_table[v] = k
		end
		global_type_table[0] = "table"
	end
	return global_type_table[getmetatable(o) or 0] or "Unknown"
end

The object count usually gives you a good idea of where your memory problems lie. For example, if the number of AiPathNode objects constantly rises, you can conclude that you are somehow leaking those objects. If you have 200 000 GridCell objects you should write a smarter grid implementation.

You can also use this enumeration technique to pinpoint problems further if necessary. For example, if you are hunting for leaks, you can rewrite the count_all() function so that it keeps track of the sub keys where an object was found. In this way, you might see that the AiPathNode objects can be accessed through paths like:

_G.managers.ai_managers.active_paths[2027]

Then you know that the source of the leak is that paths never get removed from the active_paths table.

2. Garbage collection is taking too long


Garbage collection is a very cache unfriendly task that can have a significant performance impact. This is especially frustrating since garbage collection doesn't really do anything. Well, it lets your gameplay programmers work faster and with fewer bugs, but when you have reached the optimization phase you tend to forget about that and just swear at the slow collector.

Lua's default garbage collection scheme is not adapted for realtime software and if you just run it straight up you will get lots of disturbing frame rate hitches. As has already been mentioned in previous #AltDevBlogADay articles, it is better to use a step size of 0 and just run the garbage collector for a certain number of milliseconds every frame:

OpaqueTimeValue start = time();
while (milliseconds_elapsed_since(start) < milliseconds_to_run)
	lua_gc(L, LUA_GCSTEP, 0);

Note that you can run this garbage collection on any thread, as long as Lua is not running at the same time, so you might be able to offset some of the cost by running the garbage collection on a background thread while your main thread is doing something non-Lua related.

How much time should you spend on garbage collection? A tricky question. If you spend too little, the garbage will grow and you will eventually run out of memory. If you spend too much, you are wasting precious milliseconds.

My preferred solution is to use a feedback mechanism. I dynamically adjust the garbage collection time so that the amount of garbage always stays below 10 % of the total Lua memory. If the garbage goes above that, I increase the collection time. If the garbage goes below, I decrease the collection time. As with all feedback mechanisms, it is a good idea to plot the curves for memory use and garbage collection time as you tweak the feedback parameters. That way you can verify that the system behaves nicely and that the curves settle down in a stable state rather than going into oscillation.

Choosing the figure 10 % is a balance between memory use and performance. If you choose a higher value, your program will use more memory (because of the increased amount of garbage). On the other hand, you can give the garbage collection a smaller time slice. I've chosen a pretty low number, because on consoles, memory is always precious. If you are targeting a platform with more memory, you can go higher.
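
Concretely, the adjustment might look something like this sketch (the helper variables and the way the garbage is estimated are assumptions, not an existing API); the resulting budget is what feeds the LUA_GCSTEP loop above:

#include <lua.hpp>

static float _gc_milliseconds = 1.0f;           // current per-frame GC budget
static float _memory_after_full_cycle = 0.0f;   // sampled whenever a collection cycle completes

void update_gc_budget(lua_State *L)
{
	// Total Lua memory in KB, as reported by the collector.
	float total = (float)lua_gc(L, LUA_GCCOUNT, 0);

	// Anything allocated since the last completed cycle is treated as garbage.
	float garbage = total - _memory_after_full_cycle;

	// Simple proportional adjustment: aim for at most 10 % garbage.
	if (garbage > 0.10f * total)
		_gc_milliseconds *= 1.1f;
	else if (_gc_milliseconds > 0.1f)
		_gc_milliseconds *= 0.9f;
}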

Let's compute how much time we need to spend on garbage collection to stay below a certain fraction 0 <= a <= 1 of garbage. Assume that we complete a full garbage collection cycle (scan all Lua memory) in time t. The amount of garbage generated in that time will be:

t g

Where g is the garbage/s created by the program. To make sure that we stay below a fraction a we must have (where m is the total memory used by the program, including the garbage):

t g <= a m

Assume that we sweep s bytes/s. Then the time t required to sweep the entire memory m will be:

t = m / s

Combining the two equations we get:

s >= g / a

So the amount of garbage collection work we need to do per frame is directly proportional to the amount of garbage / s generated by the program and inversely proportional to the fraction of garbage we are willing to accept. (Note that interestingly, m cancels out of the equation.)

So, if we are willing to spend more memory, we can address garbage collection problems by increasing a. But since a can never be higher than 1, there are limits to what we can achieve in this way. A better option, that doesn't cost any memory, is to reduce g -- the amount of garbage generated.

In my experience, most garbage generation problems are "easy mistakes" from sloppy and thoughtless programming. Once you know where the problems are, it is usually not hard to rewrite the code so that garbage generation is avoided. Some useful refactoring techniques are:

  • Update the fields in an existing table instead of creating a new one.
  • Return a reference to an object member rather than a copy. Copy only when needed.
  • Write functions so that they take and return values rather than tables to avoid temporary tables. I. e., make_point(2,3) rather than make_point({2,3}).
  • If you need temporary objects, find a way of reusing them so you don't need to create so many of them.
  • Avoid excessive string concatenation.

Of course a key requirement for this to work is that your Lua-to-C bindings are written so that they don't generate garbage. Otherwise your poor gameplay programmer has no chance. In my opinion, it should be possible to call any C function in a "garbage free" way (though you may choose to also have a more convenient path that does generate garbage). For tips on how to write garbage free bindings, see my previous posts on Lightweight Lua Bindings.

To reduce garbage generation, you need to be able to pinpoint where in the program the garbage is being generated. Luckily, that is not difficult.

Once the game has reached a stable state (total Lua memory doesn't grow or shrink) any allocation made can be considered garbage, because it will soon be freed again (otherwise the Lua memory would keep growing). So to find the garbage all you have to do is to add some tracing code to lua_Alloc that you can trigger when you have reached a stable state.

You can use lua_getstack() to get the current Lua stack trace from inside lua_Alloc and use a HashMap to count the number of allocations associated with each stack trace. If you then sort this data by the number of allocations it is easy to identify the "hotspots" that are generating the most garbage. A gameplay programmer can go through this list and reduce the amount of garbage generation using the tips above.

The code may look something like this:

struct TraceEntry {
	TraceEntry() : alloc_count(0), alloc_bytes(0) {}
	String trace;
	unsigned alloc_count;
	unsigned alloc_bytes;
};
HashMap<uint64, TraceEntry> _traces;

if (_tracing_allocs) {
	lua_Debug stack[5] = {0};
	int count = lua_debugger::stack_dump(L, stack, 5);
	uint64 hash = murmur_hash_64(&stack[0], sizeof(lua_Debug)*count, 0);
	TraceEntry &te = _traces[hash];
	te.alloc_count += 1;
	te.alloc_bytes += (new_size - old_size);
	if (te.trace.empty())
		lua_debugger::stack_dump_to_string(L, te.trace);
}

In my experience, spending a few hours on fixing the worst hot spots indicated by the trace can reduce the garbage collection time by an order of magnitude.

Sunday, June 26, 2011

Lightweight Lua Bindings

A scripting language, such as Lua, can bring huge productivity gains to a game project. Quick iterations, immediate code reloads and an in-game console with a read-eval-print-loop are invaluable tools. A less obvious benefit is that introducing a scripting language creates a clear dividing line between "engine" and "gameplay" code with a well defined API between them. This is often good for the structure of the engine, at least if you intend to use it for more than one game.

The main drawback is of course performance. It is a scary thing to discover late in a project that the game is slow because the script is doing too much. Especially since bad script performance cannot always be traced back to bugs or bad algorithms. Sure, you get those as well, but you can also get problems with "overall slowness" that cannot easily be traced back to specific bottlenecks or hot spots. There are two reasons for this. First, the slowness of script code compared to C, which means that everything just takes more time. And second, the fact that gameplay code tends to be "connection" rather than "compute" heavy which means there is less to gain from algorithmic improvements.

Part of this is a management issue. It is important to monitor the script performance (on the slowest target platform) throughout the production so that measures can be taken early if it looks like it will become a problem. But in this article I will focus on the technical aspects, specifically the C-to-Lua bindings.

It is important to note that when I am talking about performance in this article I mean performance on current generation consoles, because that is where performance problems occur. PC processors are much more powerful (especially when running virtual machines, which tend to be brutal to the cache). The extra cores on the consoles don't help much with script execution (since scripts are connection heavy, they are hard to multithread). And the PC can run LuaJIT which changes the game completely.

This may of course change in future generation consoles. If anyone from Sony or Microsoft is reading this, please add support for JITting to your next generation ventures.

Lua bindings


Apart from optimizing the Lua interpreter itself, optimizing the bindings between Lua and C is the best way of achieving a general performance improvement, since the bindings are used whenever Lua calls some function in the C code which in a typical game happens constantly.

The standard way of binding an object on the C side to Lua is to use a full userdata object. This is a heap allocated data blob with an associated metatable that can be used to store the methods of the object. This allows the user to make a call like:

game_world:get_camera():set_position(Vector3(0,0,0))

In many ways, this is the easiest and most convenient way of using objects in Lua, but it comes with several performance problems:

  • Any time an object is passed from C to Lua, such as the camera in get_camera()
    or the vector created by Vector3(0,0,0), memory for the object must be allocated on the heap. This can be costly.
  • All the heap objects must be garbage collected by Lua. Calls such as get_camera() create temporary objects that must be collected at some later time. The more garbage we create, the more time we need to spend in garbage collection.
  • Making use of many heap allocated objects can lead to bad cache performance. When the C side wants to use an object from Lua, it must first fetch it from Lua's heap, then (in most cases) extract an object pointer from its data and look up the object in the game heap. So each time there is an extra cache miss.
  • The colon method call syntax world:get_camera() actually translates to something like (I've simplified this a bit, see the Lua documentation for details) world._meta_table["get_camera"](world). I.e., it creates an extra table lookup operation for every call.

We can get rid of the first two issues by caching the Lua objects. I.e. instead of creating a new Lua object every time get_camera() is called, we keep a reference to the object on the Lua side and just look it up and return it every time it is requested. But this has other disadvantages. Managing the cache can be tricky and it creates a lot more objects in the Lua heap, since the heap will now hold every object that has ever been touched by Lua. This makes garbage collection take longer and the heap can grow uncontrollably during the play of a level, depending on which objects the player interacts with. Also, this doesn't solve the issue with objects that are truly temporary, such as Vector3(0,0,0).

A better option is to use what Lua calls light userdata. A light userdata is essentially just a C pointer stored in Lua, with no additional information. It lives on the Lua stack (i.e. not the heap), does not require any memory allocations, does not participate in garbage collection and does not have an associated metatable. This addresses all our performance problems, but introduces new (not performance-related) issues:

  • Since the objects don't have metatables we cannot use the convenient colon syntax for calling their methods.
  • Light user data objects do not carry any type information, they are just raw pointers. So on the C side we have no way of telling if we have been called with an object of the right type.
  • Lifetime management is trickier since objects do not have destructors and are not garbage collected. How do we manage dangling pointers in Lua?
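
Before looking at each of these issues, here is roughly what a light userdata binding looks like on the C side (a sketch; World, Camera and their member functions are hypothetical):

#include <lua.hpp>

struct Camera { void set_position(float x, float y, float z); };
struct World { Camera *camera(); };

static int world_get_camera(lua_State *L)
{
	World *world = (World *)lua_touserdata(L, 1);
	lua_pushlightuserdata(L, world->camera());
	return 1;
}

static int camera_set_position(lua_State *L)
{
	Camera *camera = (Camera *)lua_touserdata(L, 1);
	// Passing the Vector3 argument without creating garbage is a separate
	// problem (see the end of the article); plain numbers are used here.
	float x = (float)lua_tonumber(L, 2);
	float y = (float)lua_tonumber(L, 3);
	float z = (float)lua_tonumber(L, 4);
	camera->set_position(x, y, z);
	return 0;
}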

Colon syntax


With light user data we cannot use the colon syntax to look up methods. Instead we must call global functions and pass in the objects as parameters. But we can still make sure to organize our methods nicely, i.e., put all the functions that operate on World objects in a table called World. It might then look something like this:

Camera.set_position(World.get_camera(game_world), Vector3(0,0,0))

If you are used to the object oriented style this way of writing can feel awkward at first. But in my experience you get accustomed to it quite quickly. It does have some implications which are not purely syntactical though. On the plus side, this style of writing makes it easy to cache the method lookups for better performance:

local camera_set_position = Camera.set_position
local world_get_camera = World.get_camera

camera_set_position(world_get_camera(game_world), Vector3(0,0,0))

This transformation is so simple that you can easily write a script that performs it on your entire code base.

The main drawback is that we are no longer doing dynamic method lookup, we are calling one specific C method. So we can't do virtual inheritance with method overrides. To me that is not a big problem because firstly, I think inheritance is vastly overrated as a design concept, and secondly, if you really need virtual calls you can always do the virtual method resolution on the C side and get the benefits while still having a static call in Lua.

Type checking


For full userdata we can check the type by looking at the metatable. The Lua library function luaL_checkudata provides this service. Since light userdata is just a raw pointer to Lua, no corresponding functionality is offered. So we need to provide the type checking ourselves. But how can we know the type of an arbitrary C pointer?

An important thing to notice is that type checking is only used for debugging. We only need to know if a function has been called with the right arguments or not. So we don't actually need to know the exact type of the pointer, we just need to know if it points to the thing we expect. And since this is only used for bug detection, it doesn't matter if we get a few false positives. And it is fine if the test takes a few cycles since we can strip it from our release builds.

Since we just need to know "is the object of this type" we can make the test different for each type. So for each type, we can just pick whatever test fits that type best. Some possibilities are:

  • Store a known four-byte type marker at the start of the object's memory. To verify the type, just dereference the pointer and check that the first four bytes match the expected marker. (This is the method I use most frequently; see the sketch after this list.)
  • Keep a hash table of all objects of the specified type and check if it is there.
  • For objects that are allocated from a pool, check that the pointer lies within the range of the pool.
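
For example, the four-byte marker test from the first bullet might look something like this (a sketch; the marker value and the Unit type are made up, and XENSURE is the assert macro used in the earlier snippets):

#include <lua.hpp>

struct Unit
{
	static const unsigned MARKER = 0x74696e75;   // arbitrary marker value
	unsigned marker;                             // always the first member
	// ... the actual unit data ...
};

static Unit *check_unit(lua_State *L, int index)
{
	Unit *unit = (Unit *)lua_touserdata(L, index);
	XENSURE(unit && unit->marker == Unit::MARKER);
	return unit;
}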

Object lifetimes


There are two approaches you can take to ownership of objects in the Lua interface. They can either be Lua owned and destroyed by the garbage collector or they can be owned by the C side and destroyed by explicit function calls. Both approaches have their advantages, but I usually lean towards the latter one. To me it feels more natural that Lua explicitly creates and destroys cameras with World.destroy_camera() rather than cameras just popping out of existence when the garbage collector feels they are no longer used. Also, since in our engine, Lua is an option, not a requirement, it makes more sense to have the ownership on the C side.

With this approach you have the problem that Lua can hold "dangling pointers" to C objects, which can lead to nasty bugs. (If you took the other approach, you would have the opposite problem, which is equally nasty.)

Again, for debugging purposes, we would want to do something similar to what we did with the type information. We would like to know, in debug builds, if the programmer has passed us a pointer to a dead object, so that we can display an error message rather than exhibit undefined behavior.

This is a trickier issue and I haven't found a clear cut solution, but here are some of the techniques I have used:

  • Clear out the marker field of the object when it is freed. That way if you attempt to use it later you will get a type error. Of course, checking this can cause an access violation if the memory has been returned to the system.
  • For objects that get created and destroyed a lot, such as particles or sound instances, let Lua manage them by IDs rather than by raw pointers.
  • Keep a hash table of all known live objects of the type.
  • Let Lua point to the object indirectly through a handle. Use some bits of the pointer to locate the handle and match the rest to a counter in the handle so that you can detect if the handle has been released and repurposed for something else.
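
The handle approach in the last bullet might be sketched like this (slot count, bit layout and names are all assumptions):

#include <stdint.h>

struct HandleSlot {
	void *object;          // NULL while the slot is free
	uint32_t generation;   // bumped every time the slot is reused
};

static HandleSlot _slots[4096];

void *make_handle(uint32_t index)
{
	// Pack the slot index in the low 16 bits and the generation above it.
	return (void *)(((uintptr_t)_slots[index].generation << 16) | index);
}

void *resolve_handle(void *handle)
{
	uintptr_t bits = (uintptr_t)handle;
	uint32_t index = (uint32_t)(bits & 0xffff);
	uint32_t generation = (uint32_t)(bits >> 16);
	if (_slots[index].generation != generation)
		return 0;   // stale handle -- the object has been destroyed
	return _slots[index].object;
}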

Conclusions


Using light instead of full userdata does make things more inconvenient. But as we have seen, there are tricks that help overcome many of these inconveniences.

We still haven't looked at the truly temporary objects, such as Vector3(0,0,0). In my next article I will discuss what can be done about them.

(This has also been posted to the BitSquid blog.)

Friday, June 10, 2011

Strings Redux

Simpler programs are better programs. Today's target: strings. In this post I will show you three ways of improving your code by simplifying your strings.

1. Use UTF-8 everywhere


When I issue programming tests I always have some question about different string encodings. It is a good way of testing if a candidate can distinguish what data represents from how it is represented. But when I write code I just use UTF-8 everywhere, both in memory and on disk. Why? UTF-8 has many advantages and no serious disadvantages.

Advantages:

  • Using the same encoding everywhere means there is never any confusion about what encoding a certain string or file should be in. If it is not in UTF-8, then it is wrong. Period.
  • UTF-8 uses the standard C data types for strings: char * and char [].
  • ASCII strings look like ASCII strings and all functions, parsers, etc that operate on ASCII strings work on UTF-8 strings without modification.

The most common disadvantages claimed for UTF-8 are:

  • UTF-8 can waste memory.
  • Finding the i’th glyph in a UTF-8 string is expensive (O(n) rather than O(1)).

There is some truth to the first point. Yes, if your text is in Japanese, UTF-8 probably uses more memory than Shift-JIS. But I don’t think that is a major issue. First, while UTF-8 is worse than other encodings for some languages, it does pretty well on average. Second, strings aren’t a big part of a game’s memory usage anyway (if they are, you are most likely doing something wrong). And third, if you care that much about string memory usage you should probably compress your string data.

Compression will pretty much nullify any differences in memory usage caused by using different encodings, since the entropy of the underlying data is the same regardless of how it is encoded. (At least in theory, it would be interesting to see someone test it in practice.)

The second point is true but also moot, since accessing glyphs at random indices in a string is a much rarer operation than you might think. For most string operations (concatenation, parsing, etc.) you never have to access individual glyphs. You can just use the same implementation as you would use for an ASCII string and it will work without modification.

In the few cases where you do need to convert to glyphs (for example for rendering) you typically do that sequentially, from the start to the end. This is still a fast operation, it is only random access of glyphs that is significantly slower with UTF-8 than with UTF-32. Another interesting thing to note is that since all continuation bytes in UTF-8 follow the pattern 10xxxxxx you can quickly find the start and end of the next or previous glyph given a char * to anywhere within a UTF-8 string.
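
As a minimal sketch of that last point (assuming a valid UTF-8 string and a pointer that is not at either end of it), stepping between glyph boundaries is just a matter of skipping continuation bytes:

static bool is_continuation(char c)
{
	return ((unsigned char)c & 0xc0) == 0x80;   // matches 10xxxxxx
}

const char *next_glyph(const char *p)
{
	++p;
	while (is_continuation(*p))
		++p;
	return p;
}

const char *previous_glyph(const char *p)
{
	--p;
	while (is_continuation(*p))
		--p;
	return p;
}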

In fact I can't think of any string operation that requires fast random access to glyphs other than completely contrived examples (given 10000 long strings, find the 1000th glyph in each). I urge my readers to try to come up with something.

2. You do not need a string class


String classes are highly overrated.

Generally speaking, code that deals with strings can be divided into two categories: code that looks at static strings (parsers, data compilers, script callbacks, etc) and code that builds dynamic strings (template formatters, debug logging, etc). In a typical game project there is a lot more of the former than the latter. Ironically, string classes don’t do a very good job with either!

For code that deals with static strings you should always use const char * rather than const string &. The former is more flexible. It allows the caller to store her strings however she likes rather than adhering to some memory model imposed by the string class. It also means that if you call the function with a static string it doesn’t get pointlessly converted to a string object.

But string classes aren’t very good for dynamic strings either, as anyone who has written something like this can attest to:

string a;
for (i = 0; i<10000; ++i)
    a += "xxx";

Depending on how your string class is implemented this can be horribly inefficient, reallocating and copying the string memory for every iteration of the loop. There are various ways of addressing this: reserving memory for the string up front or using some kind of "rope" or "stringstream" class.

The simpler approach is to just use:

vector<char> a;
for (i=0; i<10000; ++i)
    string::append(a, "xxx");

We represent the string as a vector of chars and provide a library of functions for performing "common string operations" on that representation.

The advantage of this over using a regular string class is that it provides a clear distinction between strings that can grow (vector<char>) and strings that can't (char *) and emphasizes what the cost of growing is (amortized linear time). Do you know the cost of growing in your string class?

3. You should almost never use strings in your runtime


The variable length nature of strings makes them slow, memory consuming and unwieldy (memory for them must be allocated and freed). If you use fixed length strings you will either use even more memory or annoy the content creators because they can't make their resource names as descriptive as they would like to.

For these reasons I think that strings in the runtime should be reserved for two purposes:

  • User interface text
  • Debugging


In particular, you shouldn't use strings for object/resource/parameter names in the runtime. Instead use string hashes. This lets you use user friendly names (strings) in your tools and fast ints in your runtime. It is also a lot easier to use than enums. Enums require global cooperation to avoid collisions. String hashes just require that you hash into a large enough key space.

We hash names during our data compile stage into either 32-bit or 64-bit ints depending on the risk of collision. If it is a global object name (such as the name of a texture) we use 64-bit ints. If it is a local name (such as the name of a bone in a character) we use 32-bit ints. Hash collision is considered a compile error. (It hasn't happened yet.)

Since user interface text should always be localized, all user interface strings are managed by the localizer. The localized text is fetched from the localizer with a string lookup key, such as "menu_file_open" (hashed to a 64-bit int of course).

This only leaves debugging. We use formatted strings for informative assert messages when something goes wrong. Our profiler and monitoring tools use interned strings to identify data. Our game programmers use debug prints to root out problems. Of course, none of this affects the end user, since the debugging strings are only used in debug builds.

Hashes can be problematic when debugging. If there is an error in the resource 0x3e728af10245bc71 it is not immediately obvious that it is the object vegetation/trees/larch_3.mesh that is at fault.

We handle this with a lookup table. When we compile our data we also create a reverse lookup table that converts from a hash value back to the original string that generated it. This table is not loaded by the runtime, but it can be accessed by our tools. So our game console, for instance, uses this table to automatically translate any hash IDs that are printed by the game.
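
As a sketch of that compile-time step (the function and table names are made up), every string that gets hashed during data compilation is simply recorded alongside its hash:

#include <stdint.h>
#include <map>
#include <string>

static std::map<uint64_t, std::string> _reverse_lookup;

uint64_t hash_and_record(const std::string &s)
{
	uint64_t h = murmur_hash_64(s.data(), (int)s.size(), 0);
	_reverse_lookup[h] = s;   // written to disk with the compiled data
	return h;
}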

However, recently I've started to also add small fixed-size debug strings to the resources themselves. Something like this:

HashMap<IdString64, MeshResource *> _meshes;

struct MeshResource
{
 char debug_name[32];
 …
};

As you can see, all the lookup tables etc, still use the 64-bit hash to identify the resource. But inside the resource is a 32-byte human friendly name (typically, the last 32 characters of the resource name), which is only used for debugging. This doesn't add much to the resource size (most resources are a lot bigger than 32 bytes) but it allows us to quickly identify a resource in the debugger or in a raw memory dump without having to open up a tool to convert hashes back to strings. I think the time saved by this is worth those extra bytes.

Thursday, May 26, 2011

Monitoring your game

Many bugs are easy to fix with debuggers, stack traces and printf-statements. But some are hard to even see with such tools. I'm thinking of things like frame rate hitches, animation glitches and camera stutters. You can't put a breakpoint on the glitch because what constitutes a glitch is only defined in relation to what happened in the frame before or what will happen in the next frame. And even if you are able to break exactly when the glitch occurs, you might not be able to tell what is going on from the call stack.

In these situations, some way of monitoring and visualizing your game's behavior can be invaluable. Indeed, if we graph the delta time for each frame, the hitches stand out clear as day.


Delta-time graph with frame rate drops.


A graph like this opens up many new ways of attacking glitch bugs. You can play the game with the graph displayed and try to see what game actions trigger the glitches. Do they happen when a certain enemy is spawned? When a particular weapon is fired? Another approach is to draw the total frame time together with the time spent in all the different subsystems. This immediately shows you which subsystem is causing the frame rate to spike. You can constrain the problem further by graphing the time spent in narrower and narrower profiler scopes.

Visualization tools like these can help with many other issues as well. Want to find out where a weird camera stutter comes from? Plot the camera position, the position of its look-at target and any other variables that may influence its behavior to pin down the source of the problem. Draw a graph representing your memory fragmentation to find problematic allocations and get an overall feeling for how bad the situation is. Does something look slightly off with the animations? Graph the bone rotations to make sure that you don't have any vibrations or discontinuities. Graph your network usage to make sure you stay below the bandwidth cap.


Rotation of a bone during a jump animation.


When you study your game in this way, you will most likely learn things that surprise you. Games are highly complex systems built by a large number of people over a long period of time. Like all complex systems, they show emergent behavior. You can be quite certain that at least someone has done at least something that is completely unexpected and totally weird. You can't hope to discover these things using just a bottom-up approach. There is too much code and too much data. Instead you must study your game as if it were an alien organism. Prod it and see how it reacts. Keep the graphs on screen and make sure that they look sane.

There are many different kinds of data that can be interesting and many ways of visualizing them - graphs, bars, charts, etc. But in all cases the pattern is pretty much the same. We have some data that we record from the game and then we have a visualizer that takes this data and draws it in some interesting way. Schematically, we can represent it like this:


Basic monitoring system schematic.


I will refine this picture shortly, but first let's do a little data-oriented design and ask ourselves how we can best store and process this data.

If you have read any of my earlier blog posts you will know that I'm a fan of big dumb continuous memory buffers and data structures that look like "file formats for memory". And this approach works perfectly for this problem. We can just store the data as a big block of concatenated structs, where each struct represents some recorded data. We begin each record with an enum specifying the type of recorded event and follow that with a variable sized struct with data for that particular event.


Data buffer layout.


The event types might be things such as ENTER_PROFILER_SCOPE, LEAVE_PROFILER_SCOPE, ALLOCATE_MEMORY, FREE_MEMORY, RECORD_GLOBAL_FLOAT, etc.

RECORD_GLOBAL_FLOAT is the event type used for all kinds of data that we want to draw in graphs. We record the data with calls like these:

record_global_float("application.delta_time", dt);
record_global_float("application.frame_rate", 1.0f / dt);

The corresponding data struct is just:

struct RecordGlobalFloatEvent {
    const char *name;
    float value;
};

Note that there is an interesting little trick being used here. When we record the events, we just record the string pointers, not the complete string data. This saves memory, makes the struct fixed size and gives us faster string compares. This works because record_global_float() is called with static string data that is always at the same address and kept in memory throughout the lifetime of the application. (In the rare case where you want to call record_global_float() with a dynamic string, you must allocate a copy of that string at some permanent location, i.e. do a form of string interning.)

Now, let's refine the picture slightly. There is a problem with recording all data to a single memory buffer and that is multithreading. If all threads record their data to the same memory buffer then we need lots of mutex locking to make sure they don't step on each other's toes.

We might also want to add support for some kind of off-line (i.e., not in-game) visualization. Off-line visualizers can take advantage of the full power of your development PC to implement more powerful visualization algorithms. And since they have near unlimited memory, they can record the entire data history so that you can explore it back and forth after the game session has ended.

With these refinements our monitoring system now looks like this:


Advanced monitoring system schematic.


Each thread has a small TLS (thread-local-storage) cache with 64 K or so of debug memory where it records its events. When the cache gets full or we reach the end of the frame, the thread acquires the lock to the global event buffer and flushes its data there.

The active on-line visualizers process the events in the buffer and visualize them. Simultaneously, we send the data over TCP so that it can be processed by any off-line visualizers. In the process we consume the buffer data and the buffer can be filled with new data from the threads.

(We allocate all the buffers we use on a special debug heap, so that we separate the allocations which we only do for debugging purposes from the allocations done by the main game.)

Recording float data requires just a few lines of code.

enum { RECORD_GLOBAL_FLOAT = 17 };
enum { THREAD_BUFFER_SIZE = 64*1024 };
__thread char *_thread_buffer;
__thread unsigned _thread_buffer_count;

inline void record_global_float(const char *name, float value)
{
	const unsigned size = 4 + sizeof(RecordGlobalFloatEvent);
	if (_thread_buffer_count + size > THREAD_BUFFER_SIZE)
		flush_thread_buffer();

	char *p = _thread_buffer + _thread_buffer_count;
	*(unsigned *)p = RECORD_GLOBAL_FLOAT;
	RecordGlobalFloatEvent *e = (RecordGlobalFloatEvent *)(p + 4);
	e->name = name;
	e->value = value;
	_thread_buffer_count += size;
}
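
On the consuming side, a visualizer might walk the flushed buffer with something like this (a sketch; add_to_graph() is a hypothetical visualizer call, and every event type must advance the read pointer by its own size):

void process_events(const char *buffer, unsigned size)
{
	const char *p = buffer;
	const char *end = buffer + size;
	while (p < end) {
		unsigned type = *(const unsigned *)p;
		p += 4;
		switch (type) {
		case RECORD_GLOBAL_FLOAT: {
			const RecordGlobalFloatEvent *e = (const RecordGlobalFloatEvent *)p;
			add_to_graph(e->name, e->value);
			p += sizeof(RecordGlobalFloatEvent);
			break;
		}
		default:
			return;   // unknown event type -- a real system handles them all
		}
	}
}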

When you have the data, writing the graph visualizer is not much work. Just save the data over a couple of frames and plot it using a line drawer.

In the BitSquid engine, we also expose all the data recording functions to Lua scripting. This makes it possible to dynamically create graphs for all kinds of data while the game is running.

As an example of this, a couple of days ago a game programmer suspected that some problematic behavior was caused by a low update frequency in the mouse driver. We quickly bashed out a couple of lines in the game console to produce a graph of the mouse data and could immediately confirm that this indeed was the case:

Core.Debug.add_updator(
  function ()
    Profiler.record_statistics("mouse", Mouse.axis(0))
  end 
)
graph make mousegraph
graph add_vector3 mousegraph mouse
graph range mousegraph -20 20


Graph of mouse input showing frames with no input.

Friday, May 6, 2011