the crowdstrike incident: let's do a deep dive into the kernel

-> bug in kernel-level driver led to memory corruption


kernel-level operations

a simplified example of how a kernel driver might register its dispatch routines and handle i/o control requests:

#include <ntddk.h>

/*
 * Driver entry point.
 *
 * Registers the create, close and device-control dispatch routines in the
 * driver object's MajorFunction table.
 *
 * DriverObject - driver object whose dispatch table is filled in.
 * RegistryPath - not used by this routine.
 *
 * Returns STATUS_SUCCESS unconditionally.
 */
NTSTATUS DriverEntry(PDRIVER_OBJECT DriverObject, PUNICODE_STRING RegistryPath) {
    // RegistryPath is deliberately unused; mark it so the unreferenced-parameter
    // warning (C4100) does not fire in builds that treat warnings as errors.
    UNREFERENCED_PARAMETER(RegistryPath);

    // initialization: route each supported IRP major function to its handler
    DriverObject->MajorFunction[IRP_MJ_CREATE] = CreateDispatch;
    DriverObject->MajorFunction[IRP_MJ_CLOSE] = CloseDispatch;
    DriverObject->MajorFunction[IRP_MJ_DEVICE_CONTROL] = DeviceControlDispatch;

    return STATUS_SUCCESS;
}

/*
 * IRP_MJ_DEVICE_CONTROL handler.
 *
 * Reads the I/O control code from the current IRP stack location, dispatches
 * on it, then completes the IRP.
 *
 * DeviceObject - target device object; not used by this routine.
 * Irp          - the I/O request packet to process and complete.
 *
 * Returns the same status that is stored in Irp->IoStatus.Status:
 * STATUS_SUCCESS for IOCTL_CUSTOM_OPERATION, STATUS_INVALID_DEVICE_REQUEST
 * for any control code this driver does not recognise.
 */
NTSTATUS DeviceControlDispatch(PDEVICE_OBJECT DeviceObject, PIRP Irp) {
    PIO_STACK_LOCATION irpSp = IoGetCurrentIrpStackLocation(Irp);
    ULONG ioControlCode = irpSp->Parameters.DeviceIoControl.IoControlCode;
    NTSTATUS status;

    UNREFERENCED_PARAMETER(DeviceObject);

    switch (ioControlCode) {
        case IOCTL_CUSTOM_OPERATION:
            // perform custom operation
            status = STATUS_SUCCESS;
            break;
        default:
            // unknown IO control code: report failure instead of pretending
            // the request was handled
            status = STATUS_INVALID_DEVICE_REQUEST;
            break;
    }

    // complete the IRP; no bytes are returned to the caller on either path
    Irp->IoStatus.Status = status;
    Irp->IoStatus.Information = 0;
    IoCompleteRequest(Irp, IO_NO_INCREMENT);

    // Irp must not be touched after IoCompleteRequest, so return the local copy.
    return status;
}

in this example, a bug in the DeviceControlDispatch function could lead to system-wide issues, because it runs in kernel space.


alternative approaches

creating a custom secure kernel

here’s a minimal example of what the entry point of a custom kernel might look like:

/*
 * Kernel entry point: runs the hardware and subsystem initialisation calls
 * in a fixed order, starts the first user-space process, then idles forever.
 * Never returns.
 */
void kernel_main() {
    /* hardware setup */
    init_gdt();
    init_idt();
    init_paging();

    /* kernel subsystems */
    init_memory_manager();
    init_process_manager();
    init_file_system();

    /* hand off to the first user-space process */
    start_init_process();

    /* idle loop: halt, and halt again each time execution resumes */
    for (;;) {
        asm("hlt");
    }
}

this requires expertise and ongoing maintenance.

operating without a kernel

conceptual example of a “kernelless” system in assembly:

section .text
global _start

; Entry point: sets up the stack, calls the hardware init stubs and the
; application, then parks the CPU permanently.
_start:
    ; Direct hardware initialization
    ; Set up stack (x86 stacks grow downward, so start at the top of the area)
    mov esp, stack_top

    ; Initialize essential hardware
    call init_video
    call init_keyboard

    ; Jump to main application code
    call main_app

    ; Halt the CPU for good. A single hlt resumes on the next interrupt and
    ; would then fall through into init_video, whose ret pops a garbage
    ; return address. Mask interrupts and loop so any wake-up halts again.
    cli
.hang:
    hlt
    jmp .hang

init_video:
    ; Direct manipulation of video hardware
    ret

init_keyboard:
    ; Direct manipulation of keyboard controller
    ret

main_app:
    ; Your main application logic
    ret

section .bss
stack_bottom:
    resb 16384 ; 16 KB for stack
stack_top:

this allows for maximum control but sacrifices the abstractions and hardware management that a kernel normally provides.

improving existing kernel-level drivers

-rigorous error checking:

/*
 * Validates a caller-supplied buffer before operating on it.
 *
 * buffer     - start of the buffer; must not be NULL.
 * bufferSize - size of the buffer; must not be zero.
 *
 * Returns STATUS_INVALID_PARAMETER for a NULL or empty buffer,
 * STATUS_ACCESS_VIOLATION when IsBufferAccessible rejects the buffer,
 * and STATUS_SUCCESS otherwise.
 */
NTSTATUS SafeOperation(PVOID buffer, SIZE_T bufferSize) {
    /* reject a missing buffer */
    if (buffer == NULL) {
        return STATUS_INVALID_PARAMETER;
    }

    /* reject an empty buffer */
    if (bufferSize == 0) {
        return STATUS_INVALID_PARAMETER;
    }

    /* validate user-mode buffer */
    if (!IsBufferAccessible(buffer, bufferSize, TRUE)) {
        return STATUS_ACCESS_VIOLATION;
    }

    /* perform operation */
    return STATUS_SUCCESS;
}

-using memory management functions:

// Single pool tag shared by the allocate/free pair below. The original code
// allocated with 'tag1' but freed with 'Tag1'; ExFreePoolWithTag expects the
// tag the block was allocated with, so both routines now use this one macro.
#define SAFE_POOL_TAG 'tag1'

/*
 * Allocates `size` bytes of non-paged pool and zero-fills them.
 *
 * Returns the new buffer, or NULL when the allocation fails. Release the
 * buffer with SafeFreeMemory.
 */
PVOID SafeAllocateMemory(SIZE_T size) {
    PVOID buffer = ExAllocatePoolWithTag(NonPagedPool, size, SAFE_POOL_TAG);
    if (buffer) {
        // pool memory is not zeroed by the allocator; clear it before use
        RtlZeroMemory(buffer, size);
    }
    return buffer;
}

/*
 * Frees a buffer obtained from SafeAllocateMemory. A NULL pointer is ignored.
 */
VOID SafeFreeMemory(PVOID buffer) {
    if (buffer) {
        ExFreePoolWithTag(buffer, SAFE_POOL_TAG);
    }
}

today’s mood board:

  • caffeinated
  • debugged
  • ready to code
  • ready to sleep

til (today i learned)

  1. perceptron -> they’re cool
  2. cuda world (below)
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

// SAXPY: y[i] = a*x[i] + y[i] for every i in [0, n).
//
// Launch layout: any 1D grid of 1D blocks. Each thread starts at its global
// index and advances by the total number of launched threads, so the result
// is correct even when the grid has fewer than n threads.
// No shared memory is used. x and y must each be valid for n floats.
// A non-positive n makes the kernel a no-op.
__global__
void saxpy(int n, float a, float *x, float *y)
{
  // 64-bit index arithmetic: the 32-bit product blockIdx.x*blockDim.x can
  // wrap on very large launches, and narrowing it to int could yield a
  // negative index that still passes an `i < n` test.
  const long long stride = (long long)gridDim.x * blockDim.x;
  for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
       i < n;
       i += stride)
    y[i] = a*x[i] + y[i];
}

// Prints a readable message and terminates the process when a CUDA runtime
// call did not return cudaSuccess. Buffers are left for process teardown.
static void checkCuda(cudaError_t status, const char *what)
{
  if (status != cudaSuccess) {
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
  }
}

// Runs saxpy on 1<<20 elements (x = 1, y = 2, a = 2), copies the result back
// and prints the maximum deviation from the expected value 4.0.
// Returns 0 on success, EXIT_FAILURE when host allocation fails; any CUDA
// failure terminates the process through checkCuda.
int main(void)
{
  const int N = 1<<20;
  const size_t bytes = (size_t)N * sizeof(float);
  const int threadsPerBlock = 256;
  const int blocks = (N + threadsPerBlock - 1) / threadsPerBlock;  // ceil-div

  float *x = (float*)malloc(bytes);
  float *y = (float*)malloc(bytes);
  if (x == NULL || y == NULL) {
    fprintf(stderr, "host allocation of %zu bytes failed\n", bytes);
    free(x);
    free(y);
    return EXIT_FAILURE;
  }

  float *d_x = NULL, *d_y = NULL;
  checkCuda(cudaMalloc(&d_x, bytes), "cudaMalloc(d_x)");
  checkCuda(cudaMalloc(&d_y, bytes), "cudaMalloc(d_y)");

  for (int i = 0; i < N; i++) {
    x[i] = 1.0f;
    y[i] = 2.0f;
  }

  checkCuda(cudaMemcpy(d_x, x, bytes, cudaMemcpyHostToDevice), "copy x to device");
  checkCuda(cudaMemcpy(d_y, y, bytes, cudaMemcpyHostToDevice), "copy y to device");

  // Perform saxpy on 1m elements
  saxpy<<<blocks, threadsPerBlock>>>(N, 2.0f, d_x, d_y);
  // A launch returns no status itself; bad launch configurations show up here.
  checkCuda(cudaGetLastError(), "saxpy launch");

  // Blocking copy: waits for the kernel and reports any fault it raised.
  checkCuda(cudaMemcpy(y, d_y, bytes, cudaMemcpyDeviceToHost), "copy y to host");

  float maxError = 0.0f;
  for (int i = 0; i < N; i++)
    maxError = fmaxf(maxError, fabsf(y[i]-4.0f));
  printf("Max error: %f\n", maxError);

  checkCuda(cudaFree(d_x), "cudaFree(d_x)");
  checkCuda(cudaFree(d_y), "cudaFree(d_y)");
  free(x);
  free(y);
  return 0;
}

random thought

don’t deploy on friday!