Could you help me with reinforcement learning?

jaroslav.tavgen · Post by **jaroslav.tavgen** » Thu Nov 06, 2025 2:39 pm

Hi!

I am trying to learn how neural networks work, and as a proof of concept I want to make a program that plays itself 4x4 tic-tac-toe and learns to master the game using neural network.

The concept is as follows:

The program endlessly plays itself. When it is a Crosses turn to move the program calculates the best move using neural network. The inputs are states of the board (double board[16]) after the move is made by crosses. Square with "0" equals -1, empty square equals 0 square with "X" equals 1.

After the game was finished each state of the board after the crosses move is saved into the "inputs" array (no duplicates). If the game was won by crosses then the value for the corresponding position in the "victories" array is increased. the value for the corresponding position in the "ties" array is increased. In any case the value for the corresponding position in the "played" array is increased.

The output value for the each position (after the move of crosses) in the game is calculated as follows: "(number of victories after that move + number of ties after that move) divided by total number of games played after that move". The idea behind this that as more games are played those outputs become more accurate.

And one more thing: the game does not add new positions unless the neural network is trained in such a way that the Mean Squared Error is below 0.01. Only after this is achieved new positions (aka new inputs) are added. This is made to make sure that the neural network is adequately learning.

The network is trained every time new position(s) were added OR after 10,000 games were played.

The neural network: 16 input leurons, 1 hidden layer, 128 hidden neurons, 1 output layer, activation function is Sigmoid.

THE PROBLEM: this neural network DOES learn. But it learns slowly. Very slowly. What can I do to improve it's speed?

Code: Select all

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#define MAX_POSITIONS 10000
#define HIDDEN_NEURONS 128
#define LEARNING_RATE 0.01
#define EPOCHS 100000

// Sigmoid activation function
double sigmoid(double x) {
    return 1.0 / (1.0 + exp(-x));
}

// Derivative of sigmoid
double sigmoid_derivative(double x) {
    return x * (1.0 - x);
}

// Random weight initialization
double random_weight() {
    return ((double)rand() / RAND_MAX) * 2.0 - 1.0;
}


int check_board(int * board, int X, int three) {
	//printf ( "\nX = %d three = %d\n", X , three);
  for (int row = 0; row < three; row++) {
    int won = 1;
    for (int column = 0; column < three; column++)
    {
	  //printf( "board[row * three + column] = %d\n", board[row * three + column]);
      if (board[row * three + column] != X)
      {
        won = 0;
        break;
      }
    }
	if (won) return 1;
  }
  for (int column = 0; column < three; column++)
  {
    int won = 1;
    for (int row = 0; row < three; row++)
    {
      if (board[row * three + column] != X)
      {
        won = 0;
        break;
      }
    }
    if (won) return 1;
  }
  int column = 0;
  int row = 0;
  int won = 1;
  while (column < three) {
    if (board[row * three + column] != X)
    {
      won = 0;
      break;
    }
    column++;
    row++;
  }
  if (won) return 1;
  column = three - 1;
  row = 0;
  won = 1;
  while (row < three) {
    if (board[row * three + column] != X)     {
      won = 0;
      break;
    }
    column--;
    row++;
  }
  if (won) return 1;
  return 0;
}

void printBoard ( int * board, int boardSideLength ) {
	printf("\n");
	for ( int row = 0; row < boardSideLength; row++ ) {
		printf("\n");
		for ( int column = 0; column < boardSideLength; column++ ) {
			if ( board [ row * boardSideLength + column ] == 0 ) printf("*");
			if ( board [ row * boardSideLength + column ] == -1 ) printf("0");
			if ( board [ row * boardSideLength + column ] == 1 ) printf("X");
		}
	}
}


int main (int argc, char **argv) {
	int boardSideLength = 4;
	int boardLength = boardSideLength * boardSideLength;
	unsigned long long zobristTable[boardLength][2];
	srand(12345);
	for (int i = 0; i < 16; i++) {
		for (int j = 0; j < 2; j++) {
			zobristTable[i][j] = ((unsigned long long)rand() << 32) | rand();
		}
	}
	srand(time(NULL));
	int numberOfInputNeurons = boardLength;

	double bias_hidden[HIDDEN_NEURONS];
	double bias_output;
	int defeats = 0;
	int draws = 0;

	double ties[MAX_POSITIONS];
	double expected_outputs[MAX_POSITIONS];
	unsigned long long hashes[MAX_POSITIONS];
	double inputs[MAX_POSITIONS][numberOfInputNeurons];
	double played[MAX_POSITIONS];
	double victories[MAX_POSITIONS];
	for ( int i = 0; i < MAX_POSITIONS; i++ ) {
		expected_outputs [ i ] = 0;
		hashes [ i ] = 0;
		played [ i ] = 0;
		ties [ i ] = 0;
		victories [ i ] = 0;
	}

	int numberOfInputs = 0;
	double lastError = 0;

	int foundNewPosition;
	int gameIsLasting = 1;
	int games = 0;
    double weights_hidden_output[HIDDEN_NEURONS];
	double weights_input_hidden[numberOfInputNeurons][HIDDEN_NEURONS];
	int wins = 0;

	// Generate random weights
    for (int inputNeuron = 0; inputNeuron < numberOfInputNeurons; inputNeuron++) {
        for (int hiddenNeuron = 0; hiddenNeuron < HIDDEN_NEURONS; hiddenNeuron++) {
            weights_input_hidden[inputNeuron][hiddenNeuron] = random_weight();
        }
    }
	for (int hiddenNeuron = 0; hiddenNeuron < HIDDEN_NEURONS; hiddenNeuron++) {
	    weights_hidden_output[hiddenNeuron] = random_weight();
        bias_hidden[hiddenNeuron] = random_weight();
	}
    bias_output = random_weight();

	// Endlessly play and train
	while ( 1 ) {
			// Prepare everything for the game
		int board[boardLength];
		for ( int i = 0; i < boardLength; i++ ) board [ i ] = 0;
		unsigned long long hash = 0;;
		double history [ boardLength ][boardLength];
		unsigned long long historyHash [ boardLength ];
		int historyLength = 0;
		int outcome = 0;
		int whoseTurn = -1;
		// Play a game
		while ( gameIsLasting ) {
			int bestMove = -1;
			int indices[ boardLength ];
			int indicesLength = 0;
			// Change sides
			whoseTurn = -whoseTurn;
			// Get all Moves
			for ( int i = 0; i < boardLength; i++ ) {
				if ( board [ i ] == 0 ) indices [ indicesLength++ ] = i;
			}
			if ( indicesLength == 0 ) break;
			// Find the best move either randompy or by appying the neural network
			if ( whoseTurn == -1 || ( rand()%10 == 0 ) ) {
				bestMove = indices [ rand() % indicesLength ];
			}
			else {
				double biggestOutput = -1;
				for ( int i = 0; i < indicesLength; i++ ) {
					board [ indices [ i ] ] = whoseTurn;
					double hidden[HIDDEN_NEURONS];
					double output = bias_output;
					for (int hiddenNeuron = 0; hiddenNeuron < HIDDEN_NEURONS; hiddenNeuron++) {
						hidden[hiddenNeuron] = bias_hidden[hiddenNeuron];
						for ( int inputNeuron = 0; inputNeuron < numberOfInputNeurons; inputNeuron++ ) {
							hidden[hiddenNeuron] += board[inputNeuron] *weights_input_hidden[inputNeuron][hiddenNeuron];
						}
						hidden[hiddenNeuron] = sigmoid ( hidden[hiddenNeuron] );
						output += hidden[hiddenNeuron] * weights_hidden_output[hiddenNeuron];
					}
					output = sigmoid(output);
					if ( output > biggestOutput ) {
						biggestOutput = output;
						bestMove = indices [ i ];
					}
					board [ indices [ i ] ] = 0;
				}
			}
			// Make a move
			board [ bestMove ] = whoseTurn;
			hash ^= zobristTable[bestMove][whoseTurn == 1 ? 0 : 1];
			// If crosses then record the position
			if ( whoseTurn == 1 ) {
				historyHash [ historyLength ] = hash;
				for ( int i = 0; i < boardLength; i++ ) history [ historyLength ] [ i ] = board [ i ];
				historyLength++;
			}
			// If the player has won terminate the game
			if ( check_board(board, whoseTurn, boardSideLength) ) {
				outcome = whoseTurn;
				break;
			}
		}
		games++;
		if ( outcome == -1 ) defeats++;
		if ( outcome == 0 ) draws++;
		if ( outcome == 1 ) wins++;
		// If 10,000 games were played then write the statistics and train the neural network
		if ( ( games % 10000 ) == 0 ) {
			printf("wins = %d draws = %d losses = %d numberOfInputs = %d\n", wins, draws, defeats, numberOfInputs );
			wins = 0;
			draws = 0;
			defeats = 0;
			goto here;
		}
		// Check the history of the game and update the statistics on each move made
		foundNewPosition = 0;
		for ( int i = 0; i < historyLength; i++ ) {
			int found = 0;
			for ( int j = 0; j < numberOfInputs; j++ ) {
				if ( hashes [ j ] == historyHash [ i ] ) {
					played [ j ] += 1;
					if ( outcome == 0 ) ties [ j ] += 1;
					if ( outcome == 1 ) victories [ j ] += 1;
					expected_outputs [ j ] = (victories [ j ] + 0.5 * ties[ j ]) / played [ j ];
					found = 1;
					break;
				}
			}
			// If the position is new then record it and set the statistics
			if ( !found && ( lastError < 0.01 ) ) {
				for ( int l = 0; l < boardLength; l++ ) {
					inputs [ numberOfInputs ] [ l ] = history [ i ] [ l ];
				}

				hashes [ numberOfInputs ] = historyHash[i];
				played [ numberOfInputs ] = 1;
				if ( outcome == 0 ) ties [ numberOfInputs ] = 1;
				if ( outcome == 1 ) victories [ numberOfInputs ] = 1;
				expected_outputs [ numberOfInputs ] = (victories [ numberOfInputs ] + 0.5 * ties[ numberOfInputs ]) / played [ numberOfInputs ];
				numberOfInputs++;
				foundNewPosition = 1;
			}
		}
		// If the new position is added or the last error was acceptable or 10,000 games were played or it is the first game then train the neural network
		if ( foundNewPosition && (lastError < 0.01) ) {
			here:
			double previousError = 1000000;
			for (int epoch = 0; epoch < EPOCHS; epoch++) {
				double total_error = 0.0;

				// Train on each sample
				for (int sample = 0; sample < numberOfInputs; sample++) {
					double target = expected_outputs[sample];

					// Forward pass
					// Hidden layer
					double hidden[HIDDEN_NEURONS];
					double output = bias_output;
					for (int hiddenNeuron = 0; hiddenNeuron < HIDDEN_NEURONS; hiddenNeuron++) {
						hidden[hiddenNeuron] = bias_hidden[hiddenNeuron];
						for ( int inputNeuron = 0; inputNeuron < numberOfInputNeurons; inputNeuron++ ) {
							hidden[hiddenNeuron] += inputs[sample][inputNeuron] *weights_input_hidden[inputNeuron][hiddenNeuron];
						}
						hidden[hiddenNeuron] = sigmoid ( hidden[hiddenNeuron] );
						output += hidden[hiddenNeuron] * weights_hidden_output[hiddenNeuron];
					}

					// Output layer
					output = sigmoid(output);

					// Calculate error
					double error = target - output;
					total_error += error * error;

					// Backward pass
					// Output layer gradients
					double d_output = error * sigmoid_derivative(output);

					// Hidden layer gradients
					double d_hidden[HIDDEN_NEURONS];
					for (int hiddenNeuron = 0; hiddenNeuron < HIDDEN_NEURONS; hiddenNeuron++) {
						d_hidden[hiddenNeuron] = d_output * weights_hidden_output[hiddenNeuron] * sigmoid_derivative(hidden[hiddenNeuron]);
					}

					double lambda = 1e-4;

					// Update output layer weights and bias
					for (int hiddenNeuron = 0; hiddenNeuron < HIDDEN_NEURONS; hiddenNeuron++) {
						weights_hidden_output[hiddenNeuron] += LEARNING_RATE * (d_output * hidden[hiddenNeuron] - lambda * weights_hidden_output[hiddenNeuron]);
					}
					bias_output += LEARNING_RATE * d_output;
					// Update hidden layer weights and biases
					for (int hiddenNeuron = 0; hiddenNeuron < HIDDEN_NEURONS; hiddenNeuron++) {
						for ( int inputNeuron = 0; inputNeuron < numberOfInputNeurons; inputNeuron++ ) {
							weights_input_hidden[inputNeuron][hiddenNeuron] += LEARNING_RATE * (d_hidden[hiddenNeuron] * inputs[sample][inputNeuron] - lambda * weights_input_hidden[inputNeuron][hiddenNeuron]);
						}
						bias_hidden[hiddenNeuron] += LEARNING_RATE * d_hidden[hiddenNeuron];
					}
				}
				double currentError = total_error / numberOfInputs;
				if ( currentError >= previousError ) break;
				previousError = currentError;
			}
			lastError = previousError;
			printf("\ndouble inputs [MAX_POSITIONS][%d] = {", boardLength);
			for ( int l = 0; l < numberOfInputs; l++ ) {
				if ( l > 0 ) printf(" },");
				printf("  { ");
				for ( int o = 0; o < boardLength; o++ ) {
					if ( o > 0 ) printf(", ");
					printf ( "%d", (int)inputs[l][o]);
				}
			}
			printf("} };");
			printf("\ndouble expected_outputs[MAX_POSITIONS]  = {\n");
			for ( int l = 0; l < numberOfInputs; l++ ) {
				if ( l > 0 ) printf(", ");
				printf("%f", expected_outputs [ l ] );
			}
			printf(" };\n");
			printf("\nunsigned long long hashes[MAX_POSITIONS]  = {\n");
			for ( int l = 0; l < numberOfInputs; l++ ) {
				if ( l > 0 ) printf(", ");
				printf("%llu", hashes [ l ] );
			}
			printf(" };\n");
			printf("double played[MAX_POSITIONS]  = {\n");
			for ( int l = 0; l < numberOfInputs; l++ ) {
				if ( l > 0 ) printf(", ");
				printf("%d", (int)played [ l ] );
			}
			printf(" };\n");
			printf("double ties[MAX_POSITIONS]  = {\n");
			for ( int l = 0; l < numberOfInputs; l++ ) {
				if ( l > 0 ) printf(", ");
				printf("%d", (int)ties [ l ] );
			}
			printf(" };\n");
			printf("double victories[MAX_POSITIONS]  = {\n");
			for ( int l = 0; l < numberOfInputs; l++ ) {
				if ( l > 0 ) printf(", ");
				printf("%d", (int)victories [ l ] );
			}
			printf(" };\n");
			printf("int numberOfInputs = %d;\n", numberOfInputs);
			printf("double lastError = %f;\n", lastError);
			printf("\n");
		}
	}
    return 0;
}