/* coherent hypertransport initialization for AMD64 * * written by Stefan Reinauer * (c) 2003-2004 by SuSE Linux AG * * This code is licensed under GPL. */ /* * This algorithm assumes a grid configuration as follows: * * nodes : 1 2 4 6 8 * org. : 1x1 2x1 2x2 2x3 2x4 * */ #include #include #include #include "arch/romcc_io.h" #include "amdk8.h" /* * Until we have a completely dynamic setup we want * to be able to map different cpu graphs. */ #define UP 0x00 #define ACROSS 0x20 #define DOWN 0x40 /* * set some default values. These are used if they are not * differently defined in the motherboard's auto.c file. * See src/mainboard/amd/quartet/auto.c for an example. */ #ifndef CONNECTION_0_1 #define CONNECTION_0_1 ACROSS #endif #ifndef CONNECTION_0_2 #define CONNECTION_0_2 UP #endif #ifndef CONNECTION_1_3 #define CONNECTION_1_3 UP #endif /* when generating a temporary row configuration we * don't want broadcast to be enabled for that node. */ #define generate_temp_row(x...) ((generate_row(x)&(~0x0f0000))|0x010000) #define clear_temp_row(x) fill_row(x,7,DEFAULT) #define enable_bsp_routing() enable_routing(0) #define NODE_HT(x) PCI_DEV(0,24+x,0) #define NODE_MP(x) PCI_DEV(0,24+x,1) #define NODE_MC(x) PCI_DEV(0,24+x,3) #define DEFAULT 0x00010101 /* default row entry */ typedef uint8_t u8; typedef uint32_t u32; typedef int bool; #define TRUE (-1) #define FALSE (0) static void disable_probes(void) { /* disable read/write/fill probes for uniprocessor setup * they don't make sense if only one cpu is available */ /* Hypetransport Transaction Control Register * F0:0x68 * [ 0: 0] Disable read byte probe * 0 = Probes issues * 1 = Probes not issued * [ 1: 1] Disable Read Doubleword probe * 0 = Probes issued * 1 = Probes not issued * [ 2: 2] Disable write byte probes * 0 = Probes issued * 1 = Probes not issued * [ 3: 3] Disable Write Doubleword Probes * 0 = Probes issued * 1 = Probes not issued. * [10:10] Disable Fill Probe * 0 = Probes issued for cache fills * 1 = Probes not issued for cache fills. */ u32 val; print_spew("Disabling read/write/fill probes for UP... "); val=pci_read_config32(NODE_HT(0), 0x68); val |= (1<<10)|(1<<9)|(1<<8)|(1<<4)|(1<<3)|(1<<2)|(1<<1)|(1 << 0); pci_write_config32(NODE_HT(0), 0x68, val); print_spew("done.\r\n"); } static void enable_routing(u8 node) { u32 val; /* HT Initialization Control Register * F0:0x6C * [ 0: 0] Routing Table Disable * 0 = Packets are routed according to routing tables * 1 = Packets are routed according to the default link field * [ 1: 1] Request Disable (BSP should clear this) * 0 = Request packets may be generated * 1 = Request packets may not be generated. * [ 3: 2] Default Link (Read-only) * 00 = LDT0 * 01 = LDT1 * 10 = LDT2 * 11 = CPU on same node * [ 4: 4] Cold Reset * - Scratch bit cleared by a cold reset * [ 5: 5] BIOS Reset Detect * - Scratch bit cleared by a cold reset * [ 6: 6] INIT Detect * - Scratch bit cleared by a warm or cold reset not by an INIT * */ /* Enable routing table */ print_spew("Enabling routing table for node "); print_spew_hex8(node); val=pci_read_config32(NODE_HT(node), 0x6c); val &= ~((1<<1)|(1<<0)); pci_write_config32(NODE_HT(node), 0x6c, val); print_spew(" done.\r\n"); } #if CONFIG_MAX_CPUS > 1 static void rename_temp_node(u8 node) { uint32_t val; print_spew("Renaming current temporary node to "); print_spew_hex8(node); val=pci_read_config32(NODE_HT(7), 0x60); val &= (~7); /* clear low bits. */ val |= node; /* new node */ pci_write_config32(NODE_HT(7), 0x60, val); print_spew(" done.\r\n"); } static bool check_connection(u8 src, u8 dest, u8 link) { /* See if we have a valid connection to dest */ u32 val; /* Detect if the coherent HT link is connected. */ val = pci_read_config32(NODE_HT(src), 0x98+link); if ( (val&0x17) != 0x03) return 0; /* Verify that the coherent hypertransport link is * established and actually working by reading the * remode node's vendor/device id */ val = pci_read_config32(NODE_HT(dest),0); if(val != 0x11001022) return 0; return 1; } static unsigned read_freq_cap(device_t dev, unsigned pos) { /* Handle bugs in valid hypertransport frequency reporting */ unsigned freq_cap; uint32_t id; freq_cap = pci_read_config16(dev, pos); freq_cap &= ~(1 << HT_FREQ_VENDOR); /* Ignore Vendor HT frequencies */ id = pci_read_config32(dev, 0); /* AMD 8131 Errata 48 */ if (id == (PCI_VENDOR_ID_AMD | (PCI_DEVICE_ID_AMD_8131_PCIX << 16))) { freq_cap &= ~(1 << HT_FREQ_800Mhz); } /* AMD 8151 Errata 23 */ if (id == (PCI_VENDOR_ID_AMD | (PCI_DEVICE_ID_AMD_8151_SYSCTRL << 16))) { freq_cap &= ~(1 << HT_FREQ_800Mhz); } /* AMD K8 Unsupported 1Ghz? */ if (id == (PCI_VENDOR_ID_AMD | (0x1100 << 16))) { freq_cap &= ~(1 << HT_FREQ_1000Mhz); } return freq_cap; } static int optimize_connection(device_t node1, uint8_t link1, device_t node2, uint8_t link2) { static const uint8_t link_width_to_pow2[]= { 3, 4, 0, 5, 1, 2, 0, 0 }; static const uint8_t pow2_to_link_width[] = { 0x7, 4, 5, 0, 1, 3 }; uint16_t freq_cap1, freq_cap2, freq_cap, freq_mask; uint8_t width_cap1, width_cap2, width_cap, width, old_width, ln_width1, ln_width2; uint8_t freq, old_freq; int needs_reset; /* Set link width and frequency */ /* Initially assume everything is already optimized and I don't need a reset */ needs_reset = 0; /* Get the frequency capabilities */ freq_cap1 = read_freq_cap(node1, link1 + PCI_HT_CAP_HOST_FREQ_CAP); freq_cap2 = read_freq_cap(node2, link2 + PCI_HT_CAP_HOST_FREQ_CAP); /* Calculate the highest possible frequency */ freq = log2(freq_cap1 & freq_cap2); /* See if I am changing the link freqency */ old_freq = pci_read_config8(node1, link1 + PCI_HT_CAP_HOST_FREQ); needs_reset |= old_freq != freq; old_freq = pci_read_config8(node2, link2 + PCI_HT_CAP_HOST_FREQ); needs_reset |= old_freq != freq; /* Set the Calulcated link frequency */ pci_write_config8(node1, link1 + PCI_HT_CAP_HOST_FREQ, freq); pci_write_config8(node2, link2 + PCI_HT_CAP_HOST_FREQ, freq); /* Get the width capabilities */ width_cap1 = pci_read_config8(node1, link1 + PCI_HT_CAP_HOST_WIDTH); width_cap2 = pci_read_config8(node2, link2 + PCI_HT_CAP_HOST_WIDTH); /* Calculate node1's input width */ ln_width1 = link_width_to_pow2[width_cap1 & 7]; ln_width2 = link_width_to_pow2[(width_cap2 >> 4) & 7]; if (ln_width1 > ln_width2) { ln_width1 = ln_width2; } width = pow2_to_link_width[ln_width1]; /* Calculate node1's output width */ ln_width1 = link_width_to_pow2[(width_cap1 >> 4) & 7]; ln_width2 = link_width_to_pow2[width_cap2 & 7]; if (ln_width1 > ln_width2) { ln_width1 = ln_width2; } width |= pow2_to_link_width[ln_width1] << 4; /* See if I am changing node1's width */ old_width = pci_read_config8(node1, link1 + PCI_HT_CAP_HOST_WIDTH + 1); needs_reset |= old_width != width; /* Set node1's widths */ pci_write_config8(node1, link1 + PCI_HT_CAP_HOST_WIDTH + 1, width); /* Calculate node2's width */ width = ((width & 0x70) >> 4) | ((width & 0x7) << 4); /* See if I am changing node2's width */ old_width = pci_read_config8(node2, link2 + PCI_HT_CAP_HOST_WIDTH + 1); needs_reset |= old_width != width; /* Set node2's widths */ pci_write_config8(node2, link2 + PCI_HT_CAP_HOST_WIDTH + 1, width); return needs_reset; } static void fill_row(u8 node, u8 row, u32 value) { pci_write_config32(NODE_HT(node), 0x40+(row<<2), value); } static void setup_row(u8 source, u8 dest, u8 cpus) { fill_row(source,dest,generate_row(source,dest,cpus)); } static void setup_temp_row(u8 source, u8 dest, u8 cpus) { fill_row(source,7,generate_temp_row(source,dest,cpus)); } static void setup_node(u8 node, u8 cpus) { u8 row; for(row=0; row 2 static void setup_temp_node(u8 node, u8 cpus) { u8 row; for(row=0; row 1 static struct setup_smp_result setup_smp(void) { struct setup_smp_result result; result.cpus = 2; result.needs_reset = 0; print_spew("Enabling SMP settings\r\n"); setup_row(0, 0, result.cpus); /* Setup and check a temporary connection to node 1 */ setup_temp_row(0, 1, result.cpus); if (!check_connection(0, 7, CONNECTION_0_1)) { print_debug("No connection to Node 1.\r\n"); clear_temp_row(0); /* delete temp connection */ setup_uniprocessor(); /* and get up working */ result.cpus = 1; return result; } /* We found 2 nodes so far */ result.needs_reset = optimize_connection(NODE_HT(0), 0x80 + CONNECTION_0_1, NODE_HT(7), 0x80 + CONNECTION_0_1); setup_node(0, result.cpus); /* Node 1 is there. Setup Node 0 correctly */ setup_remote_node(1, result.cpus); /* Setup the routes on the remote node */ rename_temp_node(1); /* Rename Node 7 to Node 1 */ enable_routing(1); /* Enable routing on Node 1 */ clear_temp_row(0); /* delete temporary connection */ #if CONFIG_MAX_CPUS > 2 result.cpus=4; /* Setup and check temporary connection from Node 0 to Node 2 */ setup_temp_row(0,2, result.cpus); if (!check_connection(0, 7, CONNECTION_0_2)) { print_debug("No connection to Node 2.\r\n"); clear_temp_row(0); /* delete temp connection */ result.cpus = 2; return result; } /* We found 3 nodes so far. Now setup a temporary * connection from node 0 to node 3 via node 1 */ setup_temp_row(0,1, result.cpus); /* temp. link between nodes 0 and 1 */ setup_temp_row(1,3, result.cpus); /* temp. link between nodes 1 and 3 */ if (!check_connection(1, 7, CONNECTION_1_3)) { print_debug("No connection to Node 3.\r\n"); clear_temp_row(0); /* delete temp connection */ clear_temp_row(1); /* delete temp connection */ result.cpus = 2; return result; } #warning "FIXME optimize the physical connections" /* We found 4 nodes so far. Now setup all nodes for 4p */ setup_node(0, result.cpus); /* The first 2 nodes are configured */ setup_node(1, result.cpus); /* already. Just configure them for 4p */ setup_temp_row(0,2, result.cpus); setup_temp_node(2, result.cpus); rename_temp_node(2); enable_routing(2); setup_temp_row(0,1, result.cpus); setup_temp_row(1,3, result.cpus); setup_temp_node(3, result.cpus); rename_temp_node(3); enable_routing(3); /* enable routing on node 3 (temp.) */ clear_temp_row(0); clear_temp_row(1); clear_temp_row(2); clear_temp_row(3); #endif print_debug_hex8(result.cpus); print_debug(" nodes initialized.\r\n"); return result; } #endif #if CONFIG_MAX_CPUS > 1 static unsigned verify_mp_capabilities(unsigned cpus) { unsigned node, row, mask; bool mp_cap=TRUE; if (cpus > 2) { mask=0x06; /* BigMPCap */ } else { mask=0x02; /* MPCap */ } for (node=0; node 0; node--) { for (row = cpus; row > 0; row--) { fill_row(NODE_HT(node-1), row-1, DEFAULT); } } setup_uniprocessor(); return 1; } #endif static void coherent_ht_finalize(unsigned cpus) { unsigned node; bool rev_a0; /* set up cpu count and node count and enable Limit * Config Space Range for all available CPUs. * Also clear non coherent hypertransport bus range * registers on Hammer A0 revision. */ #if 0 print_debug("coherent_ht_finalize\r\n"); #endif rev_a0 = is_cpu_rev_a0(); for (node = 0; node < cpus; node++) { device_t dev; uint32_t val; dev = NODE_HT(node); /* Set the Total CPU and Node count in the system */ val = pci_read_config32(dev, 0x60); val &= (~0x000F0070); val |= ((cpus-1)<<16)|((cpus-1)<<4); pci_write_config32(dev, 0x60, val); /* Only respond to real cpu pci configuration cycles * and optimize the HT settings */ val=pci_read_config32(dev, 0x68); val &= ~((HTTC_BUF_REL_PRI_MASK << HTTC_BUF_REL_PRI_SHIFT) | (HTTC_MED_PRI_BYP_CNT_MASK << HTTC_MED_PRI_BYP_CNT_SHIFT) | (HTTC_HI_PRI_BYP_CNT_MASK << HTTC_HI_PRI_BYP_CNT_SHIFT)); val |= HTTC_LIMIT_CLDT_CFG | (HTTC_BUF_REL_PRI_8 << HTTC_BUF_REL_PRI_SHIFT) | HTTC_RSP_PASS_PW | (3 << HTTC_MED_PRI_BYP_CNT_SHIFT) | (3 << HTTC_HI_PRI_BYP_CNT_SHIFT); pci_write_config32(dev, 0x68, val); if (rev_a0) { pci_write_config32(dev, 0x94, 0); pci_write_config32(dev, 0xb4, 0); pci_write_config32(dev, 0xd4, 0); } } #if 0 print_debug("done\r\n"); #endif } static int apply_cpu_errata_fixes(unsigned cpus, int needs_reset) { unsigned node; for(node = 0; node < cpus; node++) { device_t dev; uint32_t cmd; dev = NODE_MC(node); if (is_cpu_pre_c0()) { /* Errata 66 * Limit the number of downstream posted requests to 1 */ cmd = pci_read_config32(dev, 0x70); if ((cmd & (3 << 0)) != 2) { cmd &= ~(3<<0); cmd |= (2<<0); pci_write_config32(dev, 0x70, cmd ); needs_reset = 1; } cmd = pci_read_config32(dev, 0x7c); if ((cmd & (3 << 4)) != 0) { cmd &= ~(3<<4); cmd |= (0<<4); pci_write_config32(dev, 0x7c, cmd ); needs_reset = 1; } /* Clock Power/Timing Low */ cmd = pci_read_config32(dev, 0xd4); if (cmd != 0x000D0001) { cmd = 0x000D0001; pci_write_config32(dev, 0xd4, cmd); needs_reset = 1; /* Needed? */ } } else { uint32_t cmd_ref; /* Errata 98 * Set Clk Ramp Hystersis to 7 * Clock Power/Timing Low */ cmd_ref = 0x04e20707; /* Registered */ cmd = pci_read_config32(dev, 0xd4); if(cmd != cmd_ref) { pci_write_config32(dev, 0xd4, cmd_ref ); needs_reset = 1; /* Needed? */ } } } return needs_reset; } static int optimize_link_read_pointers(unsigned cpus, int needs_reset) { unsigned node; for(node = 0; node < cpus; node = node + 1) { device_t f0_dev, f3_dev; uint32_t cmd_ref, cmd; int link; f0_dev = NODE_HT(node); f3_dev = NODE_MC(node); cmd_ref = cmd = pci_read_config32(f3_dev, 0xdc); for(link = 0; link < 3; link = link + 1) { uint32_t link_type; unsigned reg; reg = 0x98 + (link * 0x20); link_type = pci_read_config32(f0_dev, reg); if (link_type & LinkConnected) { cmd &= 0xff << (link *8); /* FIXME this assumes the device on the other side is an AMD device */ cmd |= 0x25 << (link *8); } } if (cmd != cmd_ref) { pci_write_config32(f3_dev, 0xdc, cmd); needs_reset = 1; } } return needs_reset; } static int setup_coherent_ht_domain(void) { struct setup_smp_result result; result.cpus = 1; result.needs_reset = 0; enable_bsp_routing(); #if CONFIG_MAX_CPUS == 1 setup_uniprocessor(); #else result = setup_smp(); result.cpus = verify_mp_capabilities(result.cpus); #endif coherent_ht_finalize(result.cpus); result.needs_reset = apply_cpu_errata_fixes(result.cpus, result.needs_reset); #if CONFIG_MAX_CPUS > 1 /* Why doesn't this work on the solo? */ result.needs_reset = optimize_link_read_pointers(result.cpus, result.needs_reset); #endif return result.needs_reset; }