amdk8: This patch fixes ram init problems when using the 9W Sempron part.
[coreboot.git] / src / northbridge / amd / amdk8 / raminit_f_dqs.c
1 /*
2  * This file is part of the coreboot project.
3  *
4  * Copyright (C) 2005 YingHai Lu
5  * Copyright (C) 2008 Advanced Micro Devices, Inc.
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; version 2 of the License.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19 */
20
/* Build-time verbosity of the DQS training debug output; 0 means no debug info. */
#define DQS_TRAIN_DEBUG 0

/*
 * Print the label 'str' followed by 'val' in hex and a CR/LF.
 *
 * Output is produced only when DQS_TRAIN_DEBUG is greater than 'level';
 * with DQS_TRAIN_DEBUG set to 0 the body is compiled out entirely.
 */
static inline void print_debug_dqs(const char *str, unsigned val, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	/* Message is more detailed than the configured verbosity: stay quiet. */
	if (level >= DQS_TRAIN_DEBUG)
		return;

#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s%x\r\n", str, val);
#else
	print_debug(str);
	print_debug_hex32(val);
	print_debug("\r\n");
#endif
#endif
}
36
/*
 * Print two label/value pairs on one line ("str val str2 val2"), each value
 * as 8 hex digits, followed by CR/LF.
 *
 * Output is produced only when DQS_TRAIN_DEBUG is greater than 'level';
 * with DQS_TRAIN_DEBUG set to 0 the body is compiled out entirely.
 */
static inline void print_debug_dqs_pair(const char *str, unsigned val, const char *str2, unsigned val2, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	if (level >= DQS_TRAIN_DEBUG)
		return;

#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s%08x%s%08x\r\n", str, val, str2, val2);
#else
	print_debug(str);
	print_debug_hex32(val);
	print_debug(str2);
	print_debug_hex32(val2);
	print_debug("\r\n");
#endif
#endif
}
49
/*
 * Print "str[i]=<val><val2>" followed by CR/LF, with 'i' as two hex digits
 * and 'val' / 'val2' as two back-to-back 8-digit hex words.
 *
 * Output is produced only when DQS_TRAIN_DEBUG is greater than 'level';
 * with DQS_TRAIN_DEBUG set to 0 the body is compiled out entirely.
 */
static inline void print_debug_dqs_tsc(const char *str, unsigned i, unsigned val, unsigned val2, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	if (level >= DQS_TRAIN_DEBUG)
		return;

#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s[%02x]=%08x%08x\r\n", str, i, val, val2);
#else
	print_debug(str);
	print_debug("[");
	print_debug_hex8(i);
	print_debug("]=");
	print_debug_hex32(val);
	print_debug_hex32(val2);
	print_debug("\r\n");
#endif
#endif
}
62
/*
 * Unconditional variant of print_debug_dqs_tsc(): always prints
 * "str[i]=<val><val2>" followed by CR/LF, regardless of DQS_TRAIN_DEBUG.
 */
static inline void print_debug_dqs_tsc_x(const char *str, unsigned i, unsigned val, unsigned val2)
{
#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s[%02x]=%08x%08x\r\n", str, i, val, val2);
#else
	print_debug(str);
	print_debug("[");
	print_debug_hex8(i);
	print_debug("]=");
	print_debug_hex32(val);
	print_debug_hex32(val2);
	print_debug("\r\n");
#endif
}
72
73 static void fill_mem_cs_sysinfo(unsigned nodeid, const struct mem_controller *ctrl, struct sys_info *sysinfo)
74 {
75
76         int i;
77         sysinfo->mem_base[nodeid] = pci_read_config32(ctrl->f1, 0x40 + (nodeid<<3));
78
79         for(i=0;i<8; i++) {
80                 sysinfo->cs_base[nodeid*8+i] = pci_read_config32(ctrl->f2, 0x40 + (i<<2));
81         }
82
83         sysinfo->hole_reg[nodeid] = pci_read_config32(ctrl->f1, 0xf0);
84
85 }
86 static unsigned Get_MCTSysAddr(const struct mem_controller *ctrl,  unsigned cs_idx, struct sys_info *sysinfo)
87 {
88         uint32_t dword;
89         uint32_t mem_base;
90         unsigned nodeid = ctrl->node_id;
91
92 #if HW_MEM_HOLE_SIZEK != 0
93         uint32_t hole_reg;
94 #endif
95
96         //get the local base addr of the chipselect
97         dword = sysinfo->cs_base[nodeid * 8 + cs_idx];
98         dword &= 0xfffffff0;
99
100         //sys addr= node base + local cs base
101         mem_base = sysinfo->mem_base[nodeid];
102         mem_base &= 0xffff0000;
103
104         dword += mem_base;
105 #if HW_MEM_HOLE_SIZEK != 0
106         hole_reg = sysinfo->hole_reg[nodeid];
107         if(hole_reg & 1) {
108                 unsigned hole_startk;
109                 hole_startk = (hole_reg & (0xff<<24)) >> 10;
110                 if( (dword >= (hole_startk<<2)) && (dword < ((4*1024*1024)<<2))) {
111                         dword += ((4*1024*1024 - hole_startk)<<2);
112                 }
113         }
114 #endif
115
116         //add 1MB offset to avoid compat area
117         dword += (1<<(20-8));
118
119         //So final result is upper 32 bit addr
120
121         return dword;
122
123 }
124
/*
 * Return the test address for receiver-enable training of chip select
 * 'cs_idx'. This simply forwards to Get_MCTSysAddr(); 'channel' is accepted
 * for interface symmetry but does not influence the result.
 */
static unsigned Get_RcvrSysAddr(const struct mem_controller * ctrl, unsigned channel, unsigned cs_idx, struct sys_info *sysinfo)
{
	unsigned test_addr = Get_MCTSysAddr(ctrl, cs_idx, sysinfo);

	return test_addr;
}
130
/* Return the current contents of control register CR4. */
static inline unsigned long read_cr4(void)
{
	unsigned long value;

	__asm__ volatile ("movl %%cr4, %0" : "=r" (value));

	return value;
}
137
/* Load control register CR4 with 'cr4'. */
static inline void write_cr4(unsigned long cr4)
{
	__asm__ volatile ("movl %0, %%cr4" : : "r" (cr4));
}
142
143
/*
 * Set bit 9 of CR4 so that the SSE instructions used by the test-pattern
 * writer can execute.
 */
static inline void enable_sse2()
{
	write_cr4(read_cr4() | (1<<9));
}
151
/*
 * Clear bit 9 of CR4 again (counterpart of enable_sse2()). The bit is
 * cleared unconditionally; its previous state is not remembered.
 */
static inline void disable_sse2()
{
	write_cr4(read_cr4() & ~(1<<9));
}
159
160
161 static void set_wrap32dis(void) {
162         msr_t msr;
163
164         msr = rdmsr(0xc0010015);
165         msr.lo |= (1<<17);
166
167         wrmsr(0xc0010015, msr);
168
169 }
170
171 static void clear_wrap32dis(void) {
172         msr_t msr;
173
174         msr = rdmsr(0xc0010015);
175         msr.lo &= ~(1<<17);
176
177         wrmsr(0xc0010015, msr);
178
179 }
180
181 static void set_FSBASE(uint32_t addr_hi)
182 {
183         msr_t msr;
184
185         //set fs and use fs prefix to access the mem
186         msr.hi = addr_hi;
187         msr.lo = 0;
188         wrmsr(0xc0000100, msr); //FS_BASE
189
190 }
191
192 static unsigned ChipSelPresent(const struct mem_controller *ctrl, unsigned cs_idx, struct sys_info *sysinfo)
193 {
194         unsigned enabled;
195         unsigned nodeid = ctrl->node_id;
196
197
198         enabled = sysinfo->cs_base[nodeid * 8 + cs_idx];
199         enabled &= 1;
200
201         return enabled;
202
203 }
204
/*
 * Report whether the rank behind chip select 'cs_idx' takes part in receiver
 * enable training. Forwards to ChipSelPresent(); 'channel' and 'is_Width128'
 * do not influence the result.
 */
static unsigned RcvrRankEnabled(const struct mem_controller *ctrl, int channel, int cs_idx, unsigned is_Width128, struct sys_info *sysinfo)
{
	unsigned present = ChipSelPresent(ctrl, cs_idx, sysinfo);

	return present;
}
209
/*
 * Copy 'line_num' cache lines (64 bytes each) of test pattern from 'buf_a'
 * to the FS-relative offset 'addr_lo', 16 bytes at a time, using
 * non-temporal SSE2 stores.
 *
 * 'buf_a' must be 16-byte aligned (movdqa). FS_BASE must already point at
 * the target region and SSE must be enabled by the caller.
 *
 * The asm advances the destination offset, the source pointer and the loop
 * counter, so all three are declared as read-write operands; xmm0 and memory
 * are listed as clobbers so the compiler neither caches the pattern buffer
 * across the asm nor assumes the registers still hold their input values
 * once this function is inlined.
 */
static void WriteLNTestPattern(unsigned addr_lo, uint8_t *buf_a, unsigned line_num)
{
	unsigned count = line_num * 4; /* four 16-byte stores per 64-byte line */

	/* "loop" with a zero count would wrap and run 2^32 times. */
	if (count == 0)
		return;

	__asm__ volatile (
		"1:\n\t"
		"movdqa (%[src]), %%xmm0\n\t"
		"movntdq %%xmm0, %%fs:(%[dst])\n\t" /* xmm0 is 128 bit */
		"addl %[step], %[dst]\n\t"
		"addl %[step], %[src]\n\t"
		"loop 1b\n\t"

		: [dst] "+a" (addr_lo), [src] "+b" (buf_a), "+c" (count)
		: [step] "d" (16)
		: "xmm0", "memory", "cc"
	);
}
225
/*
 * Write one cache line of test pattern to system address 'addr' (in units of
 * 256 bytes). Pattern buffer 'buf_b' is used when 'p' is 1, 'buf_a' for any
 * other value.
 */
static void Write1LTestPattern(unsigned addr, unsigned p, uint8_t *buf_a, uint8_t *buf_b)
{
	uint8_t *pattern = (p == 1) ? buf_b : buf_a;

	/* Upper address bits go into FS_BASE, the rest is the FS offset. */
	set_FSBASE (addr>>24);
	WriteLNTestPattern(addr<<8, pattern, 1);
}
236
/*
 * Touch system address 'addr' (in units of 256 bytes) with a single 32-bit
 * FS-relative load so the cache line is filled. The loaded value is
 * discarded.
 */
static void Read1LTestPattern(unsigned addr)
{
	unsigned discard;

	set_FSBASE(addr>>24);

	/* 1st move causes read fill (to exclusive or shared)*/
	__asm__ volatile (
		"movl %%fs:(%1), %0\n\t"
		:"=b"(discard): "a" (addr<<8)
	);
}
250
/* Result of a single pattern comparison. */
#define DQS_PASS 0
#define DQS_FAIL 1

/* Training pass selector handed to TrainRcvrEn() / CompareTestPatternQW0(). */
#define DQS_FIRST_PASS 1
#define DQS_SECOND_PASS 2

/*
 * Status codes and tuning constants for DQS training. The SB_* values are
 * OR-ed into the error accumulator by the training routines.
 */
#define SB_NORCVREN 11
#define RCVREN_MARGIN 6
#define SB_SmallRCVR 13
#define SB_CHA2BRCVREN 12
#define SB_NODQSPOS  14
#define MIN_DQS_WNDW 3
#define SB_SMALLDQS 15


/*
 * Compare the first quadword read back from system address 'addr' (in units
 * of 256 bytes) against the expected test pattern.
 *
 * Expected data: on the first pass TestPattern1 when 'pattern' is 1,
 * otherwise TestPattern0; on any other pass TestPattern2. In 128-bit mode
 * channel 1 is checked 8 bytes into the line (and two words into the
 * pattern).
 *
 * The low dword is compared first; the high dword is only read when the low
 * one matched. Returns DQS_PASS when both dwords match and DQS_FAIL
 * otherwise, except on DQS_SECOND_PASS where the verdict is inverted.
 */
static unsigned CompareTestPatternQW0(unsigned channel, unsigned addr, unsigned pattern, const uint32_t *TestPattern0, const uint32_t *TestPattern1, const uint32_t *TestPattern2, unsigned Pass, unsigned is_Width128)
{
	const uint32_t *expect;
	uint32_t offset;
	uint32_t readback;
	uint32_t wanted;
	unsigned qw_matches = 0;

	if (Pass != DQS_FIRST_PASS)
		expect = TestPattern2;
	else if (pattern == 1)
		expect = TestPattern1;
	else
		expect = TestPattern0;

	set_FSBASE(addr>>24);

	offset = addr<<8;

	if (is_Width128 && (channel == 1)) {
		offset += 8; //second channel
		expect += 2;
	}

	/* low dword */
	__asm__ volatile (
		"movl %%fs:(%1), %0\n\t"
		:"=b"(readback): "a" (offset)
	);
	wanted = *expect;

	print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : test_buf= ", (unsigned)expect, " value = ", wanted, 4);
	print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : addr_lo = ", offset, " value = ", readback, 4);

	if (readback == wanted) {
		/* high dword */
		offset += 4;
		expect++;
		__asm__ volatile (
			"movl %%fs:(%1), %0\n\t"
			:"=b"(readback): "a" (offset)
		);
		wanted = *expect;

		print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : test_buf= ", (unsigned)expect, " value = ", wanted, 4);
		print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : addr_lo = ", offset, " value = ", readback, 4);

		qw_matches = (readback == wanted);
	}

	/* second pass need to be inverted */
	if (Pass == DQS_SECOND_PASS)
		return qw_matches ? DQS_FAIL : DQS_PASS;

	return qw_matches ? DQS_PASS : DQS_FAIL;
}
334
335 static void SetMaxAL_RcvrDly(const struct mem_controller *ctrl, unsigned dly)
336 {
337         uint32_t reg;
338
339         dly += (20-1); // round it
340         dly /= 20; // convert from unit 50ps to 1ns
341
342         dly += 6;
343
344
345         reg = pci_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
346         reg &= ~(DCH_MaxAsyncLat_MASK <<DCH_MaxAsyncLat_SHIFT);
347         reg |= ((dly - DCH_MaxAsyncLat_BASE) << DCH_MaxAsyncLat_SHIFT);
348         pci_write_config32(ctrl->f2, DRAM_CONFIG_HIGH, reg);
349
350 }
351
352 /*
353         Set the Target range to WT IO (using an IORR overlapping the already existing
354         WB dram type). Use IORR0
355 */
356 static void SetTargetWTIO(unsigned addr)
357 {
358         msr_t msr;
359         msr.hi = addr>>24;
360         msr.lo = addr<<8;
361         wrmsr(0xc0010016, msr); //IORR0 BASE
362
363         msr.hi = 0xff;
364         msr.lo = 0xfc000800;  // 64MB Mask
365         wrmsr(0xc0010017, msr); // IORR0 Mask
366 }
367
368 static void ResetTargetWTIO(void)
369 {
370         msr_t msr;
371
372         msr.hi = 0;
373         msr.lo = 0;
374         wrmsr(0xc0010017, msr); // IORR0 Mask
375 }
376
/*
 * Flush the cache line containing system address 'addr' (in units of 256
 * bytes) with an FS-relative clflush.
 */
static void proc_CLFLUSH(unsigned addr)
{
	set_FSBASE(addr>>24);

	__asm__ volatile (
		/* clflush fs:[eax] */
		"clflush %%fs:(%0)\n\t"
		::"a" (addr<<8)
	);
}
/*
 * Flush the cache line at system address 'addr' while the range is
 * temporarily mapped through IORR0 (see SetTargetWTIO()), then remove the
 * IORR0 mapping again.
 */
static void proc_IOCLFLUSH(unsigned addr)
{
	SetTargetWTIO(addr);
	proc_CLFLUSH(addr);
	ResetTargetWTIO();
}
396
397 static void ResetDCTWrPtr(const struct mem_controller *ctrl)
398 {
399         uint32_t dword;
400         unsigned index = 0x10;
401
402         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
403         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
404
405         index += 0x20;
406         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
407         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
408
409 }
410
411
412 static uint16_t get_exact_T1000(unsigned i)
413 {
414         //                                 200   266,   333,  400
415         static const uint16_t T1000_a[]= { 5000, 3759, 3003, 2500 };
416
417         static const uint16_t TT_a[] = {
418                  /*200   266   333   400 */
419          /*4 */   6250, 6250, 6250, 6250,
420          /*5 */   5000, 5000, 5000, 2500,
421          /*6 */   5000, 4166, 4166, 2500,
422          /*7 */   5000, 4285, 3571, 2500,
423
424          /*8 */   5000, 3750, 3125, 2500,
425          /*9 */   5000, 3888, 3333, 2500,
426          /*10*/   5000, 4000, 3000, 2500,
427          /*11*/   5000, 4090, 3181, 2500,
428
429          /*12*/   5000, 3750, 3333, 2500,
430          /*13*/   5000, 3846, 3076, 2500,
431          /*14*/   5000, 3928, 3214, 2500,
432          /*15*/   5000, 4000, 3000, 2500,
433         };
434
435         int index;
436         msr_t msr;
437
438         /* Check for FID control support */
439         struct cpuid_result cpuid1;
440         cpuid1 = cpuid(0x8000007);
441         if( cpuid1.edx & 0x02 ) {
442                 /* Use current FID */
443                 unsigned fid_cur;
444                 msr = rdmsr(0xc0010042);
445                 fid_cur = msr.lo & 0x3f;
446
447                 index = fid_cur>>1;
448         } else {
449                 /* Use startup FID */
450                 unsigned fid_start;
451                 msr = rdmsr(0xc0010015);
452                 fid_start = (msr.lo & (0x3f << 24));
453                 
454                 index = fid_start>>25;
455         }
456
457         if(index>12) return T1000_a[i];
458
459         return TT_a[index * 4+i];
460
461 }
462
463 static void InitDQSPos4RcvrEn(const struct mem_controller *ctrl)
464 {
465         int i;
466         uint32_t dword;
467
468         dword = 0x00000000;
469         for(i=1; i<=3; i++) {
470                 /* Program the DQS Write Timing Control Registers (Function 2:Offset 0x9c, index 0x01-0x03, 0x21-0x23) to 0x00 for all bytes */
471                 pci_write_config32_index_wait(ctrl->f2, 0x98, i, dword);
472                 pci_write_config32_index_wait(ctrl->f2, 0x98, i+0x20, dword);
473         }
474
475         dword = 0x2f2f2f2f;
476         for(i=5; i<=7; i++) {
477                 /* Program the DQS Write Timing Control Registers (Function 2:Offset 0x9c, index 0x05-0x07, 0x25-0x27) to 0x2f for all bytes */
478                 pci_write_config32_index_wait(ctrl->f2, 0x98, i, dword);
479                 pci_write_config32_index_wait(ctrl->f2, 0x98, i+0x20, dword);
480         }
481
482
483 }
484 #ifndef K8_REV_F_SUPPORT_F0_F1_WORKAROUND
485 #define K8_REV_F_SUPPORT_F0_F1_WORKAROUND 1
486 #endif
487
488 static unsigned TrainRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
489 {
490
491         static const uint32_t TestPattern0[] = {
492                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
493                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
494                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
495                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
496                 };
497         static const uint32_t TestPattern1[] = {
498                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
499                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
500                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
501                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
502                 };
503         static const uint32_t TestPattern2[] = {
504                         0x12345678, 0x87654321, 0x23456789, 0x98765432,
505                         0x59385824, 0x30496724, 0x24490795, 0x99938733,
506                         0x40385642, 0x38465245, 0x29432163, 0x05067894,
507                         0x12349045, 0x98723467, 0x12387634, 0x34587623,
508                 };
509
510         uint8_t pattern_buf_x[64 * 4 + 16]; // We need to two cache line So have more 16 bytes to keep 16 byte alignment */
511         uint8_t *buf_a, *buf_b;
512         uint32_t ecc_bit;
513         uint32_t dword;
514         uint8_t *dqs_rcvr_dly_a = &sysinfo->dqs_rcvr_dly_a[ctrl->node_id * 2* 8] ; //8 node, channel 2, receiver 8
515
516         int i;
517
518         unsigned channel, receiver;
519
520         unsigned Errors;
521         unsigned CTLRMaxDelay;
522         unsigned T1000;
523
524         unsigned LastTest;
525         unsigned CurrTest;
526         unsigned Test0, Test1;
527
528         unsigned RcvrEnDlyRmin;
529
530         unsigned two_ranks;
531         unsigned RcvrEnDly;
532
533         unsigned PatternA;
534         unsigned PatternB;
535
536         unsigned TestAddr0, TestAddr0B, TestAddr1, TestAddr1B = 0;
537
538         unsigned CurrRcvrCHADelay = 0;
539
540         unsigned tmp;
541
542         unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
543
544         unsigned cpu_f0_f1;
545
546         if(Pass == DQS_FIRST_PASS) {
547                 InitDQSPos4RcvrEn(ctrl);
548         }
549
550         //enable SSE2
551         enable_sse2();
552
553         //wrap32dis
554         set_wrap32dis();
555
556         //disable ECC temp
557         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
558         ecc_bit = dword & DCL_DimmEccEn;
559         dword &= ~(DCL_DimmEccEn);
560         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
561
562
563         if(Pass == DQS_FIRST_PASS) {
564 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
565         cpu_f0_f1 = is_cpu_pre_f2_in_bsp(ctrl->node_id);
566         if(!cpu_f0_f1)
567 #endif
568         {
569 #if 1
570                 /* Set the DqsRcvEnTrain bit */
571                 dword = pci_read_config32(ctrl->f2, DRAM_CTRL);
572                 dword |= DC_DqsRcvEnTrain;
573                 pci_write_config32(ctrl->f2, DRAM_CTRL, dword);
574 #endif
575         }
576         }
577
578         //get T1000 figures (cycle time (ns)) * 1K
579         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
580         dword &= DCH_MemClkFreq_MASK;
581
582         T1000 = get_exact_T1000(dword);
583
584         // SetupRcvrPattern
585         buf_a = (uint8_t *)(((uint32_t)(&pattern_buf_x[0]) + 0x10) & (0xfffffff0));
586         buf_b = buf_a + 128; //??
587         if(Pass==DQS_FIRST_PASS) {
588                 for(i=0;i<16;i++) {
589                         *((uint32_t *)(buf_a + i*4)) = TestPattern0[i];
590                         *((uint32_t *)(buf_b + i*4)) = TestPattern1[i];
591                 }
592         }
593         else {
594                 for(i=0;i<16;i++) {
595                         *((uint32_t *)(buf_a + i*4)) = TestPattern2[i];
596                         *((uint32_t *)(buf_b + i*4)) = TestPattern2[i];
597                 }
598         }
599
600         print_debug_dqs("\r\nTrainRcvEn: 0 ctrl", ctrl->node_id, 0);
601
602         print_debug_addr("TrainRcvEn: buf_a:", buf_a);
603
604         Errors = 0;
605         /* for each channel */
606         CTLRMaxDelay = 0;
607         channel = 0;
608
609         if (!(sysinfo->meminfo[ctrl->node_id].dimm_mask & 0x0F) &&
610              (sysinfo->meminfo[ctrl->node_id].dimm_mask & 0xF0)) { /* channelB only? */
611                 channel = 1;
612         }
613
614         for ( ; (channel < 2) && (!Errors); channel++)
615         { 
616                 print_debug_dqs("\tTrainRcvEn51: channel ",channel, 1); 
617                 
618                 /* for each rank */ 
619                 /* there are four recriver pairs, loosely associated with CS */ 
620                 for( receiver = 0; (receiver < 8) && (!Errors); receiver+=2) 
621                 {
622
623                         unsigned index=(receiver>>1) * 3 + 0x10;
624
625                         print_debug_dqs("\t\tTrainRcvEn52: index ", index, 2);
626
627                         if(is_Width128) {
628                                 if(channel) {
629                                         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
630                                         CurrRcvrCHADelay= dword & 0xff;
631                                 }
632                         }
633                         else {
634                                 if(channel) {
635                                         index += 0x20;
636                                 }
637                         }
638
639                         LastTest = DQS_FAIL;
640                         RcvrEnDlyRmin = 0xaf;
641
642                         if(!RcvrRankEnabled(ctrl, channel, receiver, is_Width128, sysinfo)) continue;
643
644                         /* for each DQS receiver enable setting */
645
646                         TestAddr0 = Get_RcvrSysAddr(ctrl, channel, receiver, sysinfo);
647
648                         TestAddr0B = TestAddr0 + (1<<(20+2-8)); // 4MB
649
650                         if(RcvrRankEnabled(ctrl, channel, receiver+1, is_Width128, sysinfo)) {
651                                 TestAddr1 = Get_RcvrSysAddr(ctrl, channel, receiver+1, sysinfo);
652                                 TestAddr1B = TestAddr1 + (1<<(20+2-8)); //4MB
653                                 two_ranks = 1;
654                         }
655                         else {
656                                 two_ranks = 0;
657                         }
658
659                         print_debug_dqs("\t\tTrainRcvEn53: TestAddr0B ", TestAddr0B, 2);
660
661                         Write1LTestPattern(TestAddr0, 0, buf_a, buf_b); // rank0 of dimm, test p0
662                         Write1LTestPattern(TestAddr0B, 1, buf_a, buf_b); //rank0 of dimm, test p1
663
664                         if(two_ranks == 1) {
665                                 Write1LTestPattern(TestAddr1, 0, buf_a, buf_b); //rank 1 of dimm
666                                 Write1LTestPattern(TestAddr1B, 1, buf_a, buf_b);//rank 1 of dimm
667                         }
668
669                         if(Pass == DQS_FIRST_PASS) {
670                                 RcvrEnDly = 0;
671                         } else {
672                                 RcvrEnDly = dqs_rcvr_dly_a[channel * 8 + receiver];
673                         }
674
675                         while ( RcvrEnDly < 0xaf) { // Sweep Delay value here
676                                 print_debug_dqs("\t\t\tTrainRcvEn541: RcvrEnDly ", RcvrEnDly, 3);
677
678                                 if(RcvrEnDly & 1) {
679                                         /* Odd steps get another pattern such that even
680                                            and odd steps alternate.
681                                            The pointers to the patterns will be swapped
682                                            at the end of the loop so they are correspond
683                                         */
684                                         PatternA = 1;
685                                         PatternB = 0;
686                                 }
687                                 else {
688                                         /* Even step */
689                                         PatternA = 0;
690                                         PatternB = 1;
691                                 }
692
693                                 /* Program current Receiver enable delay */
694                                 pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
695                                 /* FIXME: 64bit MUX */
696
697                                 if(is_Width128) {
698                                         /* Program current Receiver enable delay chaannel b */
699                                         pci_write_config32_index_wait(ctrl->f2, 0x98, index+ 0x20, RcvrEnDly);
700                                 }
701
702                                 /* Program the MaxAsyncLat filed with the
703                                    current DQS receiver enable setting plus 6ns
704                                 */
705                                 /*Porgram MaxAsyncLat to correspond with current delay */
706                                 SetMaxAL_RcvrDly(ctrl, RcvrEnDly);
707
708                                 CurrTest = DQS_FAIL;
709
710                                 Read1LTestPattern(TestAddr0);  //Cache Fill
711                                 /* ROM vs cache compare */
712                                 Test0 = CompareTestPatternQW0(channel, TestAddr0, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
713                                 proc_IOCLFLUSH(TestAddr0);
714
715                                 ResetDCTWrPtr(ctrl);
716
717                                 print_debug_dqs("\t\t\tTrainRcvEn542: Test0 ", Test0, 3);
718
719                                 if(Test0 == DQS_PASS) {
720
721                                         Read1LTestPattern(TestAddr0B);
722                                         Test1 = CompareTestPatternQW0(channel, TestAddr0B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
723                                         proc_IOCLFLUSH(TestAddr0B);
724
725                                         ResetDCTWrPtr(ctrl);
726
727                                         print_debug_dqs("\t\t\tTrainRcvEn543: Test1 ", Test1, 3);
728
729                                         if(Test1 == DQS_PASS) {
730                                                 if(two_ranks) {
731                                                         Read1LTestPattern(TestAddr1);
732                                                         Test0 = CompareTestPatternQW0(channel, TestAddr1, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
733                                                         proc_IOCLFLUSH(TestAddr1);
734                                                         ResetDCTWrPtr(ctrl);
735
736                                                         if(Test0 == DQS_PASS) {
737                                                                 Read1LTestPattern(TestAddr1B);
738                                                                 Test1 = CompareTestPatternQW0(channel, TestAddr1B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
739                                                                 proc_IOCLFLUSH(TestAddr1B);
740                                                                 ResetDCTWrPtr(ctrl);
741
742                                                                 if(Test1 == DQS_PASS) {
743                                                                         CurrTest = DQS_PASS;
744                                                                 }
745                                                         }
746                                                         print_debug_dqs("\t\t\tTrainRcvEn544: Test0 ", Test0, 3);
747                                                 }
748                                                 else {
749                                                         CurrTest = DQS_PASS;
750                                                 }
751                                         }
752                                 }
753
754                                 print_debug_dqs("\t\t\tTrainRcvEn55: RcvrEnDly ", RcvrEnDly, 3);
755
756                                 if(CurrTest == DQS_PASS) {
757                                         if(LastTest == DQS_FAIL) {
758                                                 RcvrEnDlyRmin = RcvrEnDly;
759                                                 break;
760                                         }
761                                 }
762
763                                 LastTest = CurrTest;
764
765                                 /* swap the rank 0 pointers */
766                                 tmp = TestAddr0;
767                                 TestAddr0 = TestAddr0B;
768                                 TestAddr0B = tmp;
769
770                                 /* swap the rank 1 pointers */
771                                 tmp = TestAddr1;
772                                 TestAddr1 = TestAddr1B;
773                                 TestAddr1B = tmp;
774
775                                 print_debug_dqs("\t\t\tTrainRcvEn56: RcvrEnDly ", RcvrEnDly, 3);
776
777                                 RcvrEnDly++;
778
779                         } // while RcvrEnDly
780
781                         print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDly ", RcvrEnDly, 2);
782
783                         if(RcvrEnDlyRmin == 0xaf) {
784                                 //no passing window
785                                 Errors |= SB_NORCVREN;
786                         }
787
788                         if(Pass == DQS_FIRST_PASS) {
789                                 // We need a better value for DQSPos trainning
790                                 RcvrEnDly = RcvrEnDlyRmin /* + RCVREN_MARGIN * T1000/64/50 */;
791                         } else {
792                                 RcvrEnDly = RcvrEnDlyRmin;
793                         }
794
795                         if(RcvrEnDly > 0xae) {
796                                 //passing window too narrow, too far delayed
797                                 Errors |= SB_SmallRCVR;
798                                 RcvrEnDly = 0xae;
799                         }
800
801                         if(Pass == DQS_SECOND_PASS) { //second pass must average vales
802                                 RcvrEnDly += dqs_rcvr_dly_a[channel * 8 + receiver] /* - (RCVREN_MARGIN * T1000/64/50)*/;
803                                 RcvrEnDly >>= 1;
804                         }
805
806                         dqs_rcvr_dly_a[channel * 8 + receiver] = RcvrEnDly;
807
808                         //Set final RcvrEnDly for this DIMM and Channel
809                         pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
810
811                         if(is_Width128) {
812                                 pci_write_config32_index_wait(ctrl->f2, 0x98, index+0x20, RcvrEnDly); // channel B
813                                 if(channel) {
814                                         pci_write_config32_index_wait(ctrl->f2, 0x98, index, CurrRcvrCHADelay);
815                                         if(RcvrEnDly > CurrRcvrCHADelay) {
816                                                 dword = RcvrEnDly - CurrRcvrCHADelay;
817                                         }
818                                         else {
819                                                 dword = CurrRcvrCHADelay - RcvrEnDly;
820                                         }
821                                         dword *= 50;
822                                         if(dword > T1000) {
823                                                 Errors |= SB_CHA2BRCVREN;
824                                         }
825                                 }
826                         }
827
828                         print_debug_dqs("\t\tTrainRcvEn63: RcvrEnDly ", RcvrEnDly, 2);
829
830                         if(RcvrEnDly > CTLRMaxDelay) {
831                                 CTLRMaxDelay = RcvrEnDly;
832                         }
833
834                         print_debug_dqs("\t\tTrainRcvEn64: CTLRMaxDelay ", CTLRMaxDelay, 2);
835
836                 } /* receiver */
837         } /* channel */
838
839         print_debug_dqs("\tTrainRcvEn65: CTLRMaxDelay ", CTLRMaxDelay, 1);
840
841         /* Program the MaxAsysncLat field with the largest DQS Receiver Enable setting */
842         SetMaxAL_RcvrDly(ctrl, CTLRMaxDelay);
843         ResetDCTWrPtr(ctrl);
844
845         //Enable ECC again
846         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
847         dword &= ~(DCL_DimmEccEn);
848         dword |= ecc_bit;
849         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
850
851         if(Pass == DQS_FIRST_PASS) {
852 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
853         if(!cpu_f0_f1)
854 #endif
855         {
856                 dword = pci_read_config32(ctrl->f2, DRAM_CTRL);
857                 dword &= ~DC_DqsRcvEnTrain;
858                 pci_write_config32(ctrl->f2, DRAM_CTRL, dword);
859         }
860         }
861
862         //Clear wrap32dis
863
864         clear_wrap32dis();
865
866         //restore SSE2 setting
867         disable_sse2();
868
869 #if MEM_TRAIN_SEQ != 1
870         /* We need tidy output for type 1 */
871         #if CONFIG_USE_PRINTK_IN_CAR
872         printk_debug(" CTLRMaxDelay=%02x", CTLRMaxDelay);
873         #else
874         print_debug(" CTLRMaxDelay="); print_debug_hex8(CTLRMaxDelay);
875         #endif
876 #endif
877
878         return (CTLRMaxDelay==0xae)?1:0;
879
880 }
881
/* Direction selectors handed to the DQS delay helpers below; the value is
 * used when the indexed register number is computed, so it picks the read
 * or the write delay register bank. */
#define DQS_READDIR 1
#define DQS_WRITEDIR 0
884
885
886 static void SetDQSDelayCSR(const struct mem_controller *ctrl, unsigned channel, unsigned bytelane, unsigned direction, unsigned dqs_delay)
887 { //ByteLane could be 0-8, last is for ECC
888         unsigned index;
889         uint32_t dword;
890         unsigned shift;
891
892         dqs_delay &= 0xff;
893
894         index = (bytelane>>2) + 1 + channel * 0x20 + (direction << 2);
895         shift = bytelane;
896         while(shift>3) {
897                 shift-=4;
898         }
899         shift <<= 3; // 8 bit
900
901         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
902         dword &= ~(0x3f<<shift);
903         dword |= (dqs_delay<<shift);
904         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
905
906 }
907
908 static void SetDQSDelayAllCSR(const struct mem_controller *ctrl, unsigned channel, unsigned direction, unsigned dqs_delay)
909 {
910         unsigned index;
911         uint32_t dword;
912         int i;
913
914         dword = 0;
915         dqs_delay &= 0xff;
916         for(i=0;i<4;i++) {
917                 dword |= dqs_delay<<(i*8);
918         }
919
920         index = 1 + channel * 0x20 + direction * 4;
921
922         for(i=0; i<2; i++) {
923                 pci_write_config32_index_wait(ctrl->f2, 0x98, index + i, dword);
924         }
925
926 }
927
/*
 * Return the midpoint between min_d and max_d. When the distance between
 * them is odd the result is rounded up.
 */
static unsigned MiddleDQS(unsigned min_d, unsigned max_d)
{
        const unsigned span = max_d - min_d;

        /* half of the span, plus one extra step for an odd span */
        return min_d + (span >> 1) + (span & 1);
}
937
/*
 * Store a trained delay in the per-node table.
 *
 * The table is laid out as [channel][direction][bytelane] with two
 * directions per channel and nine byte lanes per direction.
 */
static  inline void save_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, uint8_t *dqs_delay_a, uint8_t dqs_delay)
{
        const unsigned slot = (channel * 2 + direction) * 9 + bytelane;

        dqs_delay_a[slot] = dqs_delay;
}
942
/*
 * Write the DQS test pattern held in buf_a to the test address.
 *
 * pattern 0 requests 9 units and pattern 1 requests 18 units from
 * WriteLNTestPattern (presumably 64-byte cache lines, matching the
 * ReadL9/ReadL18 readers -- TODO confirm against WriteLNTestPattern).
 */
static void WriteDQSTestPattern(unsigned addr_lo, unsigned pattern , uint8_t *buf_a)
{
        const unsigned count = 9 * (pattern + 1);

        WriteLNTestPattern(addr_lo, buf_a, count);
}
947
/*
 * Read back the 18-line test pattern: one dword is loaded from each of 18
 * consecutive 64-byte lines starting at fs:addr_lo. The loaded values are
 * discarded; only the memory reads themselves matter.
 *
 * The fs base must already select the region under test (TrainDQSPos
 * programs it with set_FSBASE() before calling in).
 */
static void ReadL18TestPattern(unsigned addr_lo)
{
        /*
         * Every load lands in eax, so eax is declared as an output. The
         * previous form passed it as an input-only operand, which must not
         * be modified by the asm.
         */
        unsigned scratch;

        /* each base register points 128 bytes into a group of four lines,
         * so the displacements -128, -64, 0 and 64 reach all four of them */
        __asm__ volatile (
                "movl %%fs:-128(%%esi), %%eax\n\t"  //TestAddr cache line
                "movl %%fs:-64(%%esi), %%eax\n\t"   //+1
                "movl %%fs:(%%esi), %%eax\n\t"  //+2
                "movl %%fs:64(%%esi), %%eax\n\t"   //+3

                "movl %%fs:-128(%%edi), %%eax\n\t"      //+4
                "movl %%fs:-64(%%edi), %%eax\n\t"       //+5
                "movl %%fs:(%%edi), %%eax\n\t"  //+6
                "movl %%fs:64(%%edi), %%eax\n\t"        //+7

                "movl %%fs:-128(%%ebx), %%eax\n\t"  //+8
                "movl %%fs:-64(%%ebx), %%eax\n\t"       //+9
                "movl %%fs:(%%ebx), %%eax\n\t"  //+10
                "movl %%fs:64(%%ebx), %%eax\n\t"        //+11

                "movl %%fs:-128(%%ecx), %%eax\n\t"      //+12
                "movl %%fs:-64(%%ecx), %%eax\n\t"       //+13
                "movl %%fs:(%%ecx), %%eax\n\t"  //+14
                "movl %%fs:64(%%ecx), %%eax\n\t"        //+15

                "movl %%fs:-128(%%edx), %%eax\n\t"      //+16
                "movl %%fs:-64(%%edx), %%eax\n\t"       //+17

                : "=a"(scratch)
                : "b" (addr_lo+128+8*64), "c" (addr_lo+128+12*64), "d" (addr_lo +128+16*64), "S"(addr_lo+128), "D"(addr_lo+128+4*64)
        );

        (void)scratch;
}
979
/*
 * Read back the 9-line test pattern: one dword is loaded from each of 9
 * consecutive 64-byte lines starting at fs:addr_lo. The loaded values are
 * discarded; only the memory reads themselves matter.
 *
 * The fs base must already select the region under test (TrainDQSPos
 * programs it with set_FSBASE() before calling in).
 */
static void ReadL9TestPattern(unsigned addr_lo)
{
        /*
         * Every load lands in eax, so eax is declared as an output. The
         * previous form passed it as an input-only operand, which must not
         * be modified by the asm.
         */
        unsigned scratch;

        /* each base register points 128 bytes into a group of four lines,
         * so the displacements -128, -64, 0 and 64 reach all four of them */
        __asm__ volatile (

                "movl %%fs:-128(%%ecx), %%eax\n\t"  //TestAddr cache line
                "movl %%fs:-64(%%ecx), %%eax\n\t"   //+1
                "movl %%fs:(%%ecx), %%eax\n\t"      //+2
                "movl %%fs:64(%%ecx), %%eax\n\t"   //+3

                "movl %%fs:-128(%%edx), %%eax\n\t"  //+4
                "movl %%fs:-64(%%edx), %%eax\n\t"   //+5
                "movl %%fs:(%%edx), %%eax\n\t"      //+6
                "movl %%fs:64(%%edx), %%eax\n\t"   //+7

                "movl %%fs:-128(%%ebx), %%eax\n\t"      //+8

                : "=a"(scratch)
                : "b" (addr_lo+128+8*64), "c"(addr_lo+128), "d"(addr_lo+128+4*64)
        );

        (void)scratch;
}
1002
1003
/*
 * Read the test pattern back from fs:addr_lo. Pattern 0 uses the 9-line
 * reader, every other pattern value uses the 18-line reader.
 */
static void ReadDQSTestPattern(unsigned addr_lo, unsigned pattern)
{
        if(pattern != 0) {
                ReadL18TestPattern(addr_lo);
                return;
        }

        ReadL9TestPattern(addr_lo);
}
1013
/*
 * Flush the 9 consecutive 64-byte lines starting at fs:addr_lo from the
 * cache with clflush.
 *
 * Each base register points 128 bytes into a group of four lines, so the
 * displacements -128, -64, 0 and 64 reach all four of them; the last
 * group contributes a single line.
 */
static void FlushDQSTestPattern_L9(unsigned addr_lo)
{
        __asm__ volatile (
                /* lines 0-3 */
                "clflush %%fs:-128(%%ecx)\n\t"
                "clflush %%fs:-64(%%ecx)\n\t"
                "clflush %%fs:(%%ecx)\n\t"
                "clflush %%fs:64(%%ecx)\n\t"

                /* lines 4-7 */
                "clflush %%fs:-128(%%eax)\n\t"
                "clflush %%fs:-64(%%eax)\n\t"
                "clflush %%fs:(%%eax)\n\t"
                "clflush %%fs:64(%%eax)\n\t"

                /* line 8 */
                "clflush %%fs:-128(%%ebx)\n\t"

                ::  "b" (addr_lo+128+8*64), "c"(addr_lo+128), "a"(addr_lo+128+4*64)
        );

}
/*
 * Flush the 18 consecutive 64-byte lines starting at fs:addr_lo from the
 * cache with clflush.
 *
 * Each base register points 128 bytes into a group of four lines, so the
 * displacements -128, -64, 0 and 64 reach all four of them; the last
 * group contributes two lines. The function is explicitly kept out of
 * line (noinline); it needs five address registers at once.
 */
static __attribute__((noinline)) void FlushDQSTestPattern_L18(unsigned addr_lo)
{
       __asm__ volatile (
                /* lines 0-3 */
                "clflush %%fs:-128(%%eax)\n\t"
                "clflush %%fs:-64(%%eax)\n\t"
                "clflush %%fs:(%%eax)\n\t"
                "clflush %%fs:64(%%eax)\n\t"

                /* lines 4-7 */
                "clflush %%fs:-128(%%edi)\n\t"
                "clflush %%fs:-64(%%edi)\n\t"
                "clflush %%fs:(%%edi)\n\t"
                "clflush %%fs:64(%%edi)\n\t"

                /* lines 8-11 */
                "clflush %%fs:-128(%%ebx)\n\t"
                "clflush %%fs:-64(%%ebx)\n\t"
                "clflush %%fs:(%%ebx)\n\t"
                "clflush %%fs:64(%%ebx)\n\t"

                /* lines 12-15 */
                "clflush %%fs:-128(%%ecx)\n\t"
                "clflush %%fs:-64(%%ecx)\n\t"
                "clflush %%fs:(%%ecx)\n\t"
                "clflush %%fs:64(%%ecx)\n\t"

                /* lines 16-17 */
                "clflush %%fs:-128(%%edx)\n\t"
                "clflush %%fs:-64(%%edx)\n\t"

                :: "b" (addr_lo+128+8*64), "c" (addr_lo+128+12*64), "d" (addr_lo +128+16*64), "a"(addr_lo+128), "D"(addr_lo+128+4*64)
        );
}
1062
/*
 * Flush the cache lines occupied by the test pattern at fs:addr_lo.
 * Pattern 0 covers 9 lines, every other pattern value covers 18 lines.
 */
static void FlushDQSTestPattern(unsigned addr_lo, unsigned pattern )
{
        if(pattern != 0) {
                FlushDQSTestPattern_L18(addr_lo);
                return;
        }

        FlushDQSTestPattern_L9(addr_lo);
}
1073
/*
 * Compare the data at fs:addr_lo with the reference pattern in buf_a and
 * report which byte lanes matched.
 *
 * 9*64/4 dwords are checked. Consecutive bytes belong to byte lanes
 * 0,1,...,7,0,1,... When pattern is 1 (dual channel) the two channels'
 * data alternate in 8-byte units, so the walk starts 8 bytes in for
 * channel 1 and skips 8 bytes after every 8 bytes compared.
 *
 * Returns a bitmap with bit N set when every byte of lane N matched
 * (0xff = all lanes passed).
 */
static unsigned CompareDQSTestPattern(unsigned channel, unsigned addr_lo, unsigned pattern, uint8_t *buf_a)
{
        uint32_t *expect = (uint32_t *)buf_a;
        unsigned pass_map = 0xff;
        unsigned lane = 0;
        unsigned remaining;

        if(channel && pattern) {
                /* second channel: its data starts one 8-byte unit later */
                addr_lo += 8;
                expect += 2;
        }

        for(remaining = 9*64/4; remaining != 0; remaining--) {
                uint32_t got;
                uint32_t want;
                uint32_t diff;
                unsigned byte;

                /* fetch the dword under test through the fs segment */
                __asm__ volatile (
                        "movl %%fs:(%1), %0\n\t"
                        :"=b"(got): "a" (addr_lo)
                );
                want = *expect;

                print_debug_dqs_pair("\t\t\t\t\t\ttest_buf= ", (unsigned)expect, " value = ", want, 7);
                print_debug_dqs_pair("\t\t\t\t\t\ttaddr_lo = ",addr_lo, " value = ", got, 7);

                /* any differing bit inside a byte fails that byte's lane */
                diff = got ^ want;
                for(byte = 0; byte < 4; byte++) {
                        if(((diff >> (byte * 8)) & 0xff) != 0) {
                                pass_map &= ~(1<<lane);
                        }
                        lane = (lane + 1) & 0x7;
                }
                print_debug_dqs("\t\t\t\t\t\tbitmap = ", pass_map, 7);

                if((lane == 0) && (pattern == 1)) {
                        /* dual channel: step over the other channel's 8 bytes */
                        addr_lo += 8;
                        expect += 2;
                }

                addr_lo += 4;
                expect += 1;
        }

        return pass_map;
}
1128
/*
 * Find and program the DQS delay of the eight data byte lanes of one
 * channel for one direction (DQS_READDIR or DQS_WRITEDIR).
 *
 * Phase 1: for every present chip select, each delay 0..47 is applied to
 * all lanes, the test pattern is written/read, and the per-lane pass
 * bitmap returned by CompareDQSTestPattern is ANDed into
 * MutualCSPassW[delay]. A lane therefore only counts as passing at a
 * given delay if it passed on every chip select.
 *
 * Phase 2: for every byte lane the widest run of consecutive passing
 * delays is located; if it is at least MIN_DQS_WNDW wide its midpoint is
 * programmed with SetDQSDelayCSR and recorded with save_dqs_delay.
 *
 * Returns 0 on success, otherwise a combination of SB_NODQSPOS (a lane
 * had no passing delay above 0) and SB_SMALLDQS (window too narrow).
 * When no chip select is present nothing is programmed and 0 is returned.
 */
static unsigned TrainDQSPos(const struct mem_controller *ctrl, unsigned channel, unsigned Direction, unsigned Pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
{
        unsigned ByteLane;
        unsigned Errors;
        unsigned BanksPresent;

        /* one pass bitmap (bit N = byte lane N) per delay setting */
        unsigned MutualCSPassW[48];

        unsigned ChipSel;
        unsigned DQSDelay;

        unsigned TestAddr;

        unsigned LastTest;
        /* Filter = widest passing run found so far, SeqPass = run currently being extended */
        unsigned RnkDlyFilterMax, RnkDlyFilterMin = 0;
        unsigned RnkDlySeqPassMax, RnkDlySeqPassMin = 0;

        Errors = 0;
        BanksPresent = 0;

        print_debug_dqs("\t\t\tTrainDQSPos begin ", 0, 3);

        print_debug_addr("TrainDQSPos: MutualCSPassW[48] :", MutualCSPassW);

        for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
                MutualCSPassW[DQSDelay] = 0xff; // Bitmapped status per delay setting, 0xff=All positions passing (1= PASS)
        }

        for(ChipSel = 0; ChipSel < 8; ChipSel++) { //logical register chipselects 0..7
                print_debug_dqs("\t\t\t\tTrainDQSPos: 11 ChipSel ", ChipSel, 4);
                //FIXME: process 64MUXedMode
                if(!ChipSelPresent(ctrl, ChipSel, sysinfo)) continue;
                BanksPresent  = 1;

                // NOTE(review): TestAddr looks like a system address in 256-byte units
                // (bits above 24 go to the fs base, TestAddr<<8 is used as the low
                // offset) -- confirm against Get_MCTSysAddr
                TestAddr = Get_MCTSysAddr(ctrl, ChipSel, sysinfo);

                print_debug_dqs("\t\t\t\tTrainDQSPos: 12 TestAddr ", TestAddr, 4);

                //set fs and use fs prefix to access the mem
                set_FSBASE(TestAddr>>24);

                // read training: the pattern is written once per chip select, before the sweep
                if(Direction == DQS_READDIR) {
                        print_debug_dqs("\t\t\t\tTrainDQSPos: 13 for read so write at first", 0, 4);
                        WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a);
                }

                for(DQSDelay = 0; DQSDelay < 48; DQSDelay++ ){
                        print_debug_dqs("\t\t\t\t\tTrainDQSPos: 141 DQSDelay ", DQSDelay, 5);
                        if(MutualCSPassW[DQSDelay] == 0) continue; //skip current delay value if other chipselects have failed all 8 bytelanes
                        SetDQSDelayAllCSR(ctrl, channel, Direction, DQSDelay);
                        print_debug_dqs("\t\t\t\t\tTrainDQSPos: 142 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
                        // write training: the pattern is rewritten at every delay under test
                        if(Direction == DQS_WRITEDIR) {
                                print_debug_dqs("\t\t\t\t\tTrainDQSPos: 143 for write", 0, 5);
                                WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a);
                        }
                        print_debug_dqs("\t\t\t\t\tTrainDQSPos: 144 Pattern ", Pattern, 5);
                        ReadDQSTestPattern(TestAddr<<8, Pattern);
                        print_debug_dqs("\t\t\t\t\tTrainDQSPos: 145 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
                        MutualCSPassW[DQSDelay] &= CompareDQSTestPattern(channel, TestAddr<<8, Pattern, buf_a); //0: fail, 1=pass
                        print_debug_dqs("\t\t\t\t\tTrainDQSPos: 146 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
                        // flush the test lines from the cache; the flush is bracketed by
                        // SetTargetWTIO/ResetTargetWTIO (defined elsewhere)
                        SetTargetWTIO(TestAddr);
                        FlushDQSTestPattern(TestAddr<<8, Pattern);
                        ResetTargetWTIO();
                }
        }

        if(BanksPresent)
        for(ByteLane = 0; ByteLane < 8; ByteLane++) {
                print_debug_dqs("\t\t\t\tTrainDQSPos: 31 ByteLane ",ByteLane, 4);

                LastTest = DQS_FAIL;
                RnkDlySeqPassMax = 0;
                RnkDlyFilterMax = 0;
                RnkDlyFilterMin = 0;
                for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
                        if(MutualCSPassW[DQSDelay] & (1<<ByteLane)) {

                                print_debug_dqs("\t\t\t\t\tTrainDQSPos: 321 DQSDelay ", DQSDelay, 5);
                                print_debug_dqs("\t\t\t\t\tTrainDQSPos: 322 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);

                                RnkDlySeqPassMax = DQSDelay;
                                if(LastTest == DQS_FAIL) {
                                        RnkDlySeqPassMin = DQSDelay; //start sequential run
                                }
                                // keep the current run if it is strictly wider than the best one so far
                                if((RnkDlySeqPassMax - RnkDlySeqPassMin)>(RnkDlyFilterMax-RnkDlyFilterMin)){
                                        RnkDlyFilterMin = RnkDlySeqPassMin;
                                        RnkDlyFilterMax = RnkDlySeqPassMax;
                                }
                                LastTest = DQS_PASS;
                        }
                        else {
                                LastTest = DQS_FAIL;
                        }
                }
                print_debug_dqs("\t\t\t\tTrainDQSPos: 33 RnkDlySeqPassMax ", RnkDlySeqPassMax, 4);

                // RnkDlySeqPassMax is still 0 when no delay above 0 passed for this lane
                if(RnkDlySeqPassMax == 0) {
                        Errors |= SB_NODQSPOS; // no passing window
                }
                else {
                        print_debug_dqs("\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMax ", RnkDlyFilterMax, 4);
                        print_debug_dqs("\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMin ", RnkDlyFilterMin, 4);
                        if((RnkDlyFilterMax - RnkDlyFilterMin)< MIN_DQS_WNDW){
                                Errors |= SB_SMALLDQS;
                        }
                        else {
                                // program and remember the centre of the widest passing window
                                unsigned middle_dqs;
                                middle_dqs = MiddleDQS(RnkDlyFilterMin, RnkDlyFilterMax);
                                print_debug_dqs("\t\t\t\tTrainDQSPos: 35 middle_dqs ",middle_dqs, 4);
                                SetDQSDelayCSR(ctrl, channel, ByteLane, Direction, middle_dqs);
                                save_dqs_delay(channel, ByteLane, Direction, dqs_delay_a, middle_dqs);
                        }
                }

        }

        print_debug_dqs("\t\t\tTrainDQSPos: end", 0xff, 3);

        return Errors;


}
1251
1252 static unsigned TrainReadDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1253 {
1254         print_debug_dqs("\t\tTrainReadPos", 0, 2);
1255         return TrainDQSPos(ctrl, channel, DQS_READDIR, pattern, buf_a, dqs_delay_a, sysinfo);
1256 }
1257
1258 static unsigned TrainWriteDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1259 {
1260         print_debug_dqs("\t\tTrainWritePos", 0, 2);
1261         return TrainDQSPos(ctrl, channel, DQS_WRITEDIR, pattern, buf_a, dqs_delay_a, sysinfo);
1262 }
1263
1264
1265
/*
 * Train the read and write DQS positions of one memory controller.
 *
 * Sequence:
 *  - enable SSE2, set wrap32dis and temporarily clear DCL_DimmEccEn (the
 *    previous ECC bit is kept in ecc_bit and restored at the end);
 *  - copy the test pattern into a 16-byte aligned stack buffer:
 *    TestPatternJD1b (18 lines, pattern = 1) when the controller runs
 *    128 bits wide, TestPatternJD1a (9 lines, pattern = 0) otherwise;
 *  - for each channel, step the write DQS delay of all lanes through
 *    0..47 until read training succeeds, then run write training;
 *  - restore ECC, clear wrap32dis and disable SSE2 again.
 *
 * Training starts on channel 1 when only the upper nibble of dimm_mask is
 * populated. In 64-bit mode only one channel is processed. The loop stops
 * as soon as a channel leaves Errors non-zero.
 *
 * Returns the resulting error mask, 0 on success.
 */
static unsigned TrainDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
{
        /* reference pattern for 64-bit (single channel) mode: 9 lines of 16 dwords */
        static const uint32_t TestPatternJD1a[] = {
                                        0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW0-1, ALL-EVEN
                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW2-3, ALL-EVEN
                                        0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW4-5, ALL-EVEN
                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW6-7, ALL-EVEN
                                        0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW0-1, DQ0-ODD
                                        0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW2-3, DQ0-ODD
                                        0x01010101,0x01010101,0xFeFeFeFe,0xFeFeFeFe, // QW4-5, DQ0-ODD
                                        0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW6-7, DQ0-ODD
                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW0-1, DQ1-ODD
                                        0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2-3, DQ1-ODD
                                        0xFdFdFdFd,0xFdFdFdFd,0x02020202,0x02020202, // QW4-5, DQ1-ODD
                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW6-7, DQ1-ODD
                                        0x04040404,0x04040404,0xfBfBfBfB,0xfBfBfBfB, // QW0-1, DQ2-ODD
                                        0x04040404,0x04040404,0x04040404,0x04040404, // QW2-3, DQ2-ODD
                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4-5, DQ2-ODD
                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6-7, DQ2-ODD
                                        0x08080808,0x08080808,0xF7F7F7F7,0xF7F7F7F7, // QW0-1, DQ3-ODD
                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW2-3, DQ3-ODD
                                        0xF7F7F7F7,0xF7F7F7F7,0x08080808,0x08080808, // QW4-5, DQ3-ODD
                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6-7, DQ3-ODD
                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW0-1, DQ4-ODD
                                        0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW2-3, DQ4-ODD
                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4-5, DQ4-ODD
                                        0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW6-7, DQ4-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0-1, DQ5-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0x20202020,0x20202020, // QW2-3, DQ5-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4-5, DQ5-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6-7, DQ5-ODD
                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0-1, DQ6-ODD
                                        0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW2-3, DQ6-ODD
                                        0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW4-5, DQ6-ODD
                                        0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW6-7, DQ6-ODD
                                        0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW0-1, DQ7-ODD
                                        0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW2-3, DQ7-ODD
                                        0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW4-5, DQ7-ODD
                                        0x80808080,0x80808080,0x80808080,0x80808080  // QW6-7, DQ7-ODD
                };
        /* reference pattern for 128-bit (dual channel) mode: 18 lines of 16 dwords */
        static const uint32_t TestPatternJD1b[] = {
                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW0,CHA-B, ALL-EVEN
                                        0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW1,CHA-B, ALL-EVEN
                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW2,CHA-B, ALL-EVEN
                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW3,CHA-B, ALL-EVEN
                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW4,CHA-B, ALL-EVEN
                                        0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW5,CHA-B, ALL-EVEN
                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW6,CHA-B, ALL-EVEN
                                        0x00000000,0x00000000,0x00000000,0x00000000, // QW7,CHA-B, ALL-EVEN
                                        0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW0,CHA-B, DQ0-ODD
                                        0x01010101,0x01010101,0x01010101,0x01010101, // QW1,CHA-B, DQ0-ODD
                                        0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW2,CHA-B, DQ0-ODD
                                        0x01010101,0x01010101,0x01010101,0x01010101, // QW3,CHA-B, DQ0-ODD
                                        0x01010101,0x01010101,0x01010101,0x01010101, // QW4,CHA-B, DQ0-ODD
                                        0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW5,CHA-B, DQ0-ODD
                                        0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW6,CHA-B, DQ0-ODD
                                        0x01010101,0x01010101,0x01010101,0x01010101, // QW7,CHA-B, DQ0-ODD
                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW0,CHA-B, DQ1-ODD
                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW1,CHA-B, DQ1-ODD
                                        0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2,CHA-B, DQ1-ODD
                                        0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW3,CHA-B, DQ1-ODD
                                        0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW4,CHA-B, DQ1-ODD
                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW5,CHA-B, DQ1-ODD
                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW6,CHA-B, DQ1-ODD
                                        0x02020202,0x02020202,0x02020202,0x02020202, // QW7,CHA-B, DQ1-ODD
                                        0x04040404,0x04040404,0x04040404,0x04040404, // QW0,CHA-B, DQ2-ODD
                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW1,CHA-B, DQ2-ODD
                                        0x04040404,0x04040404,0x04040404,0x04040404, // QW2,CHA-B, DQ2-ODD
                                        0x04040404,0x04040404,0x04040404,0x04040404, // QW3,CHA-B, DQ2-ODD
                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4,CHA-B, DQ2-ODD
                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW5,CHA-B, DQ2-ODD
                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6,CHA-B, DQ2-ODD
                                        0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW7,CHA-B, DQ2-ODD
                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW0,CHA-B, DQ3-ODD
                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW1,CHA-B, DQ3-ODD
                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW2,CHA-B, DQ3-ODD
                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW3,CHA-B, DQ3-ODD
                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW4,CHA-B, DQ3-ODD
                                        0x08080808,0x08080808,0x08080808,0x08080808, // QW5,CHA-B, DQ3-ODD
                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6,CHA-B, DQ3-ODD
                                        0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW7,CHA-B, DQ3-ODD
                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW0,CHA-B, DQ4-ODD
                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW1,CHA-B, DQ4-ODD
                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW2,CHA-B, DQ4-ODD
                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW3,CHA-B, DQ4-ODD
                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4,CHA-B, DQ4-ODD
                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW5,CHA-B, DQ4-ODD
                                        0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW6,CHA-B, DQ4-ODD
                                        0x10101010,0x10101010,0x10101010,0x10101010, // QW7,CHA-B, DQ4-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0,CHA-B, DQ5-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW1,CHA-B, DQ5-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW2,CHA-B, DQ5-ODD
                                        0x20202020,0x20202020,0x20202020,0x20202020, // QW3,CHA-B, DQ5-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4,CHA-B, DQ5-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW5,CHA-B, DQ5-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6,CHA-B, DQ5-ODD
                                        0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW7,CHA-B, DQ5-ODD
                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0,CHA-B, DQ6-ODD
                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW1,CHA-B, DQ6-ODD
                                        0x40404040,0x40404040,0x40404040,0x40404040, // QW2,CHA-B, DQ6-ODD
                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW3,CHA-B, DQ6-ODD
                                        0x40404040,0x40404040,0x40404040,0x40404040, // QW4,CHA-B, DQ6-ODD
                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW5,CHA-B, DQ6-ODD
                                        0x40404040,0x40404040,0x40404040,0x40404040, // QW6,CHA-B, DQ6-ODD
                                        0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW7,CHA-B, DQ6-ODD
                                        0x80808080,0x80808080,0x80808080,0x80808080, // QW0,CHA-B, DQ7-ODD
                                        0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW1,CHA-B, DQ7-ODD
                                        0x80808080,0x80808080,0x80808080,0x80808080, // QW2,CHA-B, DQ7-ODD
                                        0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW3,CHA-B, DQ7-ODD
                                        0x80808080,0x80808080,0x80808080,0x80808080, // QW4,CHA-B, DQ7-ODD
                                        0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW5,CHA-B, DQ7-ODD
                                        0x80808080,0x80808080,0x80808080,0x80808080, // QW6,CHA-B, DQ7-ODD
                                        0x80808080,0x80808080,0x80808080,0x80808080  // QW7,CHA-B, DQ7-ODD
                };
        uint8_t pattern_buf_x[64 * 18 + 16]; // room for 18 lines of 64 bytes plus 16 spare bytes so that buf_a can be aligned to 16 bytes
        uint8_t *buf_a;

        unsigned pattern;
        uint32_t dword;
        uint32_t ecc_bit;
        unsigned Errors;
        unsigned channel;
        int i;
        unsigned DQSWrDelay;
        unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
        uint8_t *dqs_delay_a = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9

        //enable SSE2
        enable_sse2();

        //wrap32dis
        set_wrap32dis();

        //disable ECC temp
        dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
        ecc_bit = dword & DCL_DimmEccEn;
        dword &= ~(DCL_DimmEccEn);
        pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);

        //SetupDqsPattern
        // next 16-byte boundary strictly above the start of pattern_buf_x
        buf_a = (uint8_t *)(((uint32_t)(&pattern_buf_x[0]) + 0x10) & (~0xf));

        if(is_Width128){
                pattern = 1;
                for(i=0;i<16*18;i++) {
                        *((uint32_t *)(buf_a + i*4)) = TestPatternJD1b[i];
                 }
        }
        else {
                pattern = 0;
                for(i=0; i<16*9;i++) {
                        *((uint32_t *)(buf_a + i*4)) = TestPatternJD1a[i];
                }

        }

        print_debug_dqs("\r\nTrainDQSRdWrPos: 0 ctrl ", ctrl->node_id, 0);

        print_debug_addr("TrainDQSRdWrPos: buf_a:", buf_a);

        Errors = 0;
        channel = 0;

        if (!(sysinfo->meminfo[ctrl->node_id].dimm_mask & 0x0F) &&
             (sysinfo->meminfo[ctrl->node_id].dimm_mask & 0xF0)) { /* channelB only? */
                channel = 1;
        }

        while( (channel<2) && (!Errors)) {
                print_debug_dqs("\tTrainDQSRdWrPos: 1 channel ",channel, 1);
                // step the write delay until read training passes; errors of failed attempts accumulate
                for(DQSWrDelay = 0; DQSWrDelay < 48; DQSWrDelay++) {
                        unsigned err;
                        SetDQSDelayAllCSR(ctrl, channel, DQS_WRITEDIR, DQSWrDelay);
                        print_debug_dqs("\t\tTrainDQSRdWrPos: 21 DQSWrDelay ", DQSWrDelay, 2);
                        err= TrainReadDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
                        print_debug_dqs("\t\tTrainDQSRdWrPos: 22 err ",err, 2);
                        if(err == 0) break;
                        Errors |= err;
                }

                print_debug_dqs("\tTrainDQSRdWrPos: 3 DQSWrDelay ", DQSWrDelay, 1);

                // read training succeeded: the write-training result replaces the accumulated errors
                if(DQSWrDelay < 48) {
                        Errors = TrainWriteDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
                        print_debug_dqs("\tTrainDQSRdWrPos: 4 Errors ", Errors, 1);

                }
                channel++;
                if(!is_Width128){
                        //FIXME: 64MuxMode??
                        channel++; // skip channel if 64-bit mode
                }
        }

        //Enable ECC again
        dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
        dword &= ~(DCL_DimmEccEn);
        dword |= ecc_bit;
        pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);

        //Clear wrap32dis

        clear_wrap32dis();

        //restore SSE2 setting
        disable_sse2();

        print_debug_dqs("TrainDQSRdWrPos: ", 5, 0);

        return Errors;

}
/*
 * Fetch one saved DQS delay from a per-node delay table.
 *
 * The table is laid out as [channel][direction][bytelane], with two
 * directions per channel and nine byte lanes per direction.
 *
 * channel:     channel index within the node
 * bytelane:    byte lane index (0..8)
 * direction:   direction index (0 or 1)
 * dqs_delay_a: start of this node's delay table
 *
 * Returns the stored delay byte.
 */
static inline uint8_t get_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, uint8_t *dqs_delay_a)
{
	const unsigned lanes_per_direction = 9;
	const unsigned directions_per_channel = 2;
	unsigned slot;

	slot = (channel * directions_per_channel + direction) * lanes_per_direction + bytelane;

	return dqs_delay_a[slot];
}
1482
/*
 * Interpolate a DQS delay between two byte lanes of one channel.
 *
 * InterFactor picks the position between the two lanes in 1/256 steps:
 *      0:    100% ByteLane 0
 *      0x80: 50% between ByteLane 0 and 1
 *      0xff: 99.6% ByteLane 1 and 0.4% like 0
 *
 * The computation always starts from the smaller of the two delays and
 * adds a fraction of the difference; when ByteLane0 holds the larger
 * delay the factor is mirrored (0xff - InterFactor). The product is
 * scaled with a right shift by 8 (a divide by 256, truncating), so a
 * mirrored factor of 0xff lands one step short of the larger delay,
 * e.g. delays 10 and 4 with InterFactor 0 give 9.
 *
 * Returns the interpolated delay.
 */
static unsigned CalcEccDQSPos(unsigned channel,unsigned ByteLane0, unsigned ByteLane1, unsigned InterFactor, unsigned Direction, uint8_t *dqs_delay_a)
{
	unsigned delay0 = get_dqs_delay(channel, ByteLane0, Direction, dqs_delay_a);
	unsigned delay1 = get_dqs_delay(channel, ByteLane1, Direction, dqs_delay_a);
	unsigned floor_delay;
	unsigned span;
	unsigned weight;

	if (delay0 > delay1) {
		floor_delay = delay1;
		span = delay0 - delay1;
		weight = 0xff - InterFactor;
	} else {
		floor_delay = delay0;
		span = delay1 - delay0;
		weight = InterFactor;
	}

	/* weight is in 1/256 units */
	return floor_delay + ((span * weight) >> 8);
}
1517
1518 static void SetEccDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1519 {
1520         unsigned channel;
1521         unsigned ByteLane;
1522         unsigned Direction;
1523         unsigned lane0, lane1, ratio;
1524         unsigned dqs_delay;
1525
1526         unsigned direction[] = { DQS_READDIR, DQS_WRITEDIR };
1527         int i;
1528         uint8_t *dqs_delay_a = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9
1529
1530         ByteLane = 8;
1531
1532         for(channel = 0; channel < 2; channel++) {
1533                 for(i=0;i<2;i++) {
1534                         Direction = direction[i];
1535                         lane0 = 4; lane1 = 5; ratio = 0;
1536                         dqs_delay = CalcEccDQSPos(channel, lane0, lane1, ratio, Direction, dqs_delay_a);
1537                         print_debug_dqs_pair("\t\tSetEccDQSRdWrPos: channel ", channel, Direction==DQS_READDIR? " R dqs_delay":" W dqs_delay",  dqs_delay, 2);
1538                         SetDQSDelayCSR(ctrl, channel, ByteLane, Direction, dqs_delay);
1539                         save_dqs_delay(channel, ByteLane, Direction, dqs_delay_a, dqs_delay);
1540                 }
1541         }
1542 }
1543
1544 static unsigned train_DqsRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
1545 {
1546         print_debug_dqs("\r\ntrain_DqsRcvrEn: begin ctrl ", ctrl->node_id, 0);
1547         if(TrainRcvrEn(ctrl, Pass, sysinfo)) {
1548                 return 1;
1549         }
1550         print_debug_dqs("\r\ntrain_DqsRcvrEn: end ctrl ", ctrl->node_id, 0);
1551         return 0;
1552
1553 }
1554 static unsigned train_DqsPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1555 {
1556         print_debug_dqs("\r\ntrain_DqsPos: begin ctrl ", ctrl->node_id, 0);
1557         if(TrainDQSRdWrPos(ctrl, sysinfo) != 0) {
1558                 print_err("\r\nDQS Training Rd Wr failed ctrl"); print_err_hex8(ctrl->node_id); print_err("\r\n");
1559                 return 1;
1560         }
1561         else {
1562                 SetEccDQSRdWrPos(ctrl, sysinfo);
1563         }
1564         print_debug_dqs("\r\ntrain_DqsPos: end ctrl ", ctrl->node_id, 0);
1565         return 0;
1566
1567 }
1568
1569 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1570 static void f0_svm_workaround(int controllers, const struct mem_controller *ctrl, tsc_t *tsc0, struct sys_info *sysinfo)
1571 {
1572         tsc_t tsc1[8];
1573         unsigned cpu_f0_f1[8];
1574         int i;
1575
1576         print_debug_addr("dqs_timing: tsc1[8] :", tsc1);
1577
1578         for(i = 0; i < controllers; i++) {
1579                 if (!sysinfo->ctrl_present[i])
1580                         continue;
1581
1582                 /* Skip everything if I don't have any memory on this controller */
1583                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1584
1585                 uint32_t dword;
1586
1587                 cpu_f0_f1[i] = is_cpu_pre_f2_in_bsp(i);
1588
1589                 if(!cpu_f0_f1[i]) continue;
1590
1591                 dword = pci_read_config32(ctrl[i].f2, DRAM_CTRL);
1592                 dword &= ~DC_DqsRcvEnTrain;
1593                 pci_write_config32(ctrl[i].f2, DRAM_CTRL, dword);
1594
1595                 dword = pci_read_config32(ctrl[i].f2, DRAM_INIT);
1596                 dword |= DI_EnDramInit;
1597                 pci_write_config32(ctrl[i].f2, DRAM_INIT, dword);
1598                 dword &= ~DI_EnDramInit;
1599                 pci_write_config32(ctrl[i].f2, DRAM_INIT, dword);
1600
1601                 tsc1[i] = rdtsc();
1602                 print_debug_dqs_tsc("begin: tsc1", i, tsc1[i].hi, tsc1[i].lo, 2);
1603
1604                 dword = tsc1[i].lo + tsc0[i].lo;
1605                 if((dword<tsc1[i].lo) || (dword<tsc0[i].lo)) {
1606                         tsc1[i].hi++;
1607                 }
1608                 tsc1[i].lo = dword;
1609                 tsc1[i].hi+= tsc0[i].hi;
1610
1611                 print_debug_dqs_tsc("end  : tsc1", i, tsc1[i].hi, tsc1[i].lo, 2);
1612
1613         }
1614
1615         for(i = 0; i < controllers; i++) {
1616                 if (!sysinfo->ctrl_present[i])
1617                         continue;
1618
1619                 /* Skip everything if I don't have any memory on this controller */
1620                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1621
1622                 if(!cpu_f0_f1[i]) continue;
1623
1624                 tsc_t tsc;
1625
1626                 do {
1627                         tsc = rdtsc();
1628                 } while ((tsc1[i].hi>tsc.hi) || ((tsc1[i].hi==tsc.hi) && (tsc1[i].lo>tsc.lo)));
1629
1630                 print_debug_dqs_tsc("end  : tsc ", i, tsc.hi, tsc.lo, 2);
1631         }
1632
1633 }
1634
1635 #endif
1636
1637
1638 /* setting variable mtrr, comes from linux kernel source */
1639 static void set_var_mtrr_dqs(
1640         unsigned int reg, unsigned long basek, unsigned long sizek,
1641         unsigned char type, unsigned address_bits)
1642 {
1643         msr_t base, mask;
1644         unsigned address_mask_high;
1645
1646         address_mask_high = ((1u << (address_bits - 32u)) - 1u);
1647
1648         base.hi = basek >> 22;
1649         base.lo  = basek << 10;
1650
1651         if (sizek < 4*1024*1024) {
1652                 mask.hi = address_mask_high;
1653                 mask.lo = ~((sizek << 10) -1);
1654         }
1655         else {
1656                 mask.hi = address_mask_high & (~((sizek >> 22) -1));
1657                 mask.lo = 0;
1658         }
1659
1660         if (reg >= 8)
1661                 return;
1662
1663         if (sizek == 0) {
1664                 msr_t zero;
1665                 zero.lo = zero.hi = 0;
1666                 /* The invalid bit is kept in the mask, so we simply clear the
1667                    relevant mask register to disable a range. */
1668                 wrmsr (MTRRphysMask_MSR(reg), zero);
1669         } else {
1670                 /* Bit 32-35 of MTRRphysMask should be set to 1 */
1671                 base.lo |= type;
1672                 mask.lo |= 0x800;
1673                 wrmsr (MTRRphysBase_MSR(reg), base);
1674                 wrmsr (MTRRphysMask_MSR(reg), mask);
1675         }
1676 }
1677
1678
/*
 * fms: find most significant bit set (adapted from the Linux kernel).
 *
 * Returns the zero-based index of the highest set bit of x, or 0 when
 * x is 0 (so fms(0) and fms(1) both return 0).
 *
 * BSR leaves its destination undefined and sets ZF when the source is
 * zero, hence the explicit fallback load. The input uses the "rm"
 * constraint because BSR has no immediate form; "g" would let the
 * compiler substitute a constant after inlining and break the assembly.
 */
static inline unsigned int fms(unsigned int x)
{
	int r;

	__asm__("bsrl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $0,%0\n"
		"1:" : "=r" (r) : "rm" (x) : "cc");
	return r;
}
1690
/*
 * fls: find least significant bit set.
 *
 * Returns the zero-based index of the lowest set bit of x, or 32 when
 * x is 0. Note that this differs from the Linux kernel's fls(), which
 * finds the *last* (most significant) set bit.
 *
 * BSF leaves its destination undefined and sets ZF when the source is
 * zero, hence the explicit fallback load. The input uses the "rm"
 * constraint because BSF has no immediate form; "g" would let the
 * compiler substitute a constant after inlining and break the assembly.
 */
static inline unsigned int fls(unsigned int x)
{
	int r;

	__asm__("bsfl %1,%0\n\t"
		"jnz 1f\n\t"
		"movl $32,%0\n"
		"1:" : "=r" (r) : "rm" (x) : "cc");
	return r;
}
1702
1703 static unsigned int range_to_mtrr(unsigned int reg,
1704         unsigned long range_startk, unsigned long range_sizek,
1705         unsigned long next_range_startk, unsigned char type, unsigned address_bits)
1706 {
1707         if (!range_sizek || (reg >= 8)) {
1708                 return reg;
1709         }
1710         while(range_sizek) {
1711                 unsigned long max_align, align;
1712                 unsigned long sizek;
1713                 /* Compute the maximum size I can make a range */
1714                 max_align = fls(range_startk);
1715                 align = fms(range_sizek);
1716                 if (align > max_align) {
1717                         align = max_align;
1718                 }
1719                 sizek = 1 << align;
1720 #if MEM_TRAIN_SEQ != 1
1721         #if CONFIG_USE_PRINTK_IN_CAR
1722                 printk_debug("Setting variable MTRR %d, base: %4dMB, range: %4dMB, type %s\r\n",
1723                         reg, range_startk >>10, sizek >> 10,
1724                         (type==MTRR_TYPE_UNCACHEABLE)?"UC":
1725                             ((type==MTRR_TYPE_WRBACK)?"WB":"Other")
1726                         );
1727         #else
1728                 print_debug("Setting variable MTRR "); print_debug_hex8(reg); print_debug(", base: "); print_debug_hex16(range_startk>>10);
1729                         print_debug("MB, range: "); print_debug_hex16(sizek >> 10); print_debug("MB, type ");
1730                         print_debug( (type==MTRR_TYPE_UNCACHEABLE)?"UC\r\n":
1731                                       ((type==MTRR_TYPE_WRBACK)?"WB\r\n":"Other\r\n")
1732                                    );
1733         #endif
1734 #endif
1735                 set_var_mtrr_dqs(reg++, range_startk, sizek, type, address_bits);
1736                 range_startk += sizek;
1737                 range_sizek -= sizek;
1738                 if (reg >= 8)
1739                         break;
1740         }
1741         return reg;
1742 }
1743
1744 static void set_top_mem_ap(unsigned tom_k, unsigned tom2_k)
1745 {
1746         msr_t msr;
1747
1748         /* Now set top of memory */
1749         msr.lo = (tom2_k & 0x003fffff) << 10;
1750         msr.hi = (tom2_k & 0xffc00000) >> 22;
1751         wrmsr(TOP_MEM2, msr);
1752
1753         msr.lo = (tom_k & 0x003fffff) << 10;
1754         msr.hi = (tom_k & 0xffc00000) >> 22;
1755         wrmsr(TOP_MEM, msr);
1756 }
1757
1758 static void setup_mtrr_dqs(unsigned tom_k, unsigned tom2_k){
1759         unsigned reg;
1760         msr_t msr;
1761
1762 #if 0
1763         //still enable from cache_as_ram.inc
1764         msr = rdmsr(SYSCFG_MSR);
1765         msr.lo |= SYSCFG_MSR_MtrrFixDramModEn;
1766         wrmsr(SYSCFG_MSR,msr);
1767 #endif
1768
1769         //[0,512k), [512k, 640k)
1770         msr.hi = 0x1e1e1e1e;
1771         msr.lo = msr.hi;
1772         wrmsr(0x250, msr);
1773         wrmsr(0x258, msr);
1774
1775         //[1M, TOM)
1776         reg = range_to_mtrr(2, 0, tom_k,4*1024*1024, MTRR_TYPE_WRBACK, 40);
1777
1778         //[4G, TOM2)
1779         if(tom2_k) {
1780                 //enable tom2 and type
1781                 msr = rdmsr(SYSCFG_MSR);
1782                 msr.lo |= (1<<21) | (1<<22); //MtrrTom2En and Tom2ForceMemTypeWB
1783                 wrmsr(SYSCFG_MSR, msr);
1784         }
1785
1786 }
1787
1788 static void clear_mtrr_dqs(unsigned tom2_k){
1789         msr_t msr;
1790         unsigned i;
1791
1792         //still enable from cache_as_ram.inc
1793         msr = rdmsr(SYSCFG_MSR);
1794         msr.lo |= SYSCFG_MSR_MtrrFixDramModEn;
1795         wrmsr(SYSCFG_MSR,msr);
1796
1797         //[0,512k), [512k, 640k)
1798         msr.hi = 0;
1799         msr.lo = msr.hi;
1800         wrmsr(0x250, msr);
1801         wrmsr(0x258, msr);
1802
1803         //[1M, TOM)
1804         for(i=0x204;i<0x210;i++) {
1805                 wrmsr(i, msr);
1806         }
1807
1808         //[4G, TOM2)
1809         if(tom2_k) {
1810                 //enable tom2 and type
1811                 msr = rdmsr(SYSCFG_MSR);
1812                 msr.lo &= ~((1<<21) | (1<<22)); //MtrrTom2En and Tom2ForceMemTypeWB
1813                 wrmsr(SYSCFG_MSR, msr);
1814         }
1815 }
1816
1817 static void set_htic_bit(unsigned i, unsigned val, unsigned bit)
1818 {
1819         uint32_t dword;
1820         dword = pci_read_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL);
1821         dword &= ~(1<<bit);
1822         dword |= ((val & 1) <<bit);
1823         pci_write_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL, dword);
1824 }
1825
1826
1827 static unsigned get_htic_bit(unsigned i, unsigned bit)
1828 {
1829         uint32_t dword;
1830         dword = pci_read_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL);
1831         dword &= (1<<bit);
1832         return dword;
1833 }
1834
/*
 * Spin until bit 9 of node 0's HT_INIT_CONTROL register reads as set.
 * That bit is the flag written by set_sysinfo_in_ram(). There is no
 * timeout.
 */
static void wait_till_sysinfo_in_ram(void)
{
	while (!get_htic_bit(0, 9)) {
		/* poll again */
	}
}
1841
/*
 * Publish the "sysinfo in RAM" flag: store the low bit of val in bit 9
 * of node 0's HT_INIT_CONTROL register, where
 * wait_till_sysinfo_in_ram() polls for it.
 */
static void set_sysinfo_in_ram(unsigned val)
{
	const unsigned node = 0;
	const unsigned flag_bit = 9;

	set_htic_bit(node, val, flag_bit);
}
1846
1847
1848 #if MEM_TRAIN_SEQ == 0
1849
1850
1851 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1852 static void dqs_timing(int controllers, const struct mem_controller *ctrl, tsc_t *tsc0, struct sys_info *sysinfo)
1853 #else
1854 static void dqs_timing(int controllers, const struct mem_controller *ctrl, struct sys_info *sysinfo)
1855 #endif
1856 {
1857         int  i;
1858
1859         tsc_t tsc[5];
1860
1861         //need to enable mtrr, so dqs training could access the test address
1862         setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
1863
1864         for(i = 0; i < controllers; i++) {
1865                 if (!sysinfo->ctrl_present[ i ])
1866                         continue;
1867
1868                 /* Skip everything if I don't have any memory on this controller */
1869                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1870
1871                 fill_mem_cs_sysinfo(i, ctrl+i, sysinfo);
1872         }
1873
1874         tsc[0] = rdtsc();
1875         for(i = 0; i < controllers; i++) {
1876                 if (!sysinfo->ctrl_present[ i ])
1877                         continue;
1878
1879                 /* Skip everything if I don't have any memory on this controller */
1880                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1881
1882                 print_debug("DQS Training:RcvrEn:Pass1: ");
1883                 print_debug_hex8(i);
1884                 if(train_DqsRcvrEn(ctrl+i, 1, sysinfo)) goto out;
1885                 print_debug(" done\r\n");
1886         }
1887
1888         tsc[1] = rdtsc();
1889 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1890         f0_svm_workaround(controllers, ctrl, tsc0, sysinfo);
1891 #endif
1892
1893         tsc[2] = rdtsc();
1894         for(i = 0; i < controllers; i++) {
1895                 if (!sysinfo->ctrl_present[i])
1896                         continue;
1897
1898                 /* Skip everything if I don't have any memory on this controller */
1899                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1900
1901                 print_debug("DQS Training:DQSPos: ");
1902                 print_debug_hex8(i);
1903                 if(train_DqsPos(ctrl+i, sysinfo)) goto out;
1904                 print_debug(" done\r\n");
1905         }
1906
1907         tsc[3] = rdtsc();
1908         for(i = 0; i < controllers; i++) {
1909                 if (!sysinfo->ctrl_present[i])
1910                         continue;
1911
1912                 /* Skip everything if I don't have any memory on this controller */
1913                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1914
1915                 print_debug("DQS Training:RcvrEn:Pass2: ");
1916                 print_debug_hex8(i);
1917                 if(train_DqsRcvrEn(ctrl+i, 2, sysinfo)) goto out;
1918                 print_debug(" done\r\n");
1919                 sysinfo->mem_trained[i]=1;
1920         }
1921
1922 out:
1923         tsc[4] = rdtsc();
1924         clear_mtrr_dqs(sysinfo->tom2_k);
1925
1926
1927         for(i=0;i<5;i++) {
1928                 print_debug_dqs_tsc_x("DQS Training:tsc", i,  tsc[i].hi, tsc[i].lo);
1929         }
1930
1931
1932
1933 }
1934
1935 #endif
1936
1937
1938 #if MEM_TRAIN_SEQ > 0
1939
1940 static void dqs_timing(int i, const struct mem_controller *ctrl, struct sys_info *sysinfo, unsigned v)
1941 {
1942
1943         int ii;
1944
1945          tsc_t tsc[4];
1946
1947         if(sysinfo->mem_trained[i] != 0x80) return;
1948
1949 #if MEM_TRAIN_SEQ == 1
1950         //need to enable mtrr, so dqs training could access the test address
1951         setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
1952 #endif
1953
1954         fill_mem_cs_sysinfo(i, ctrl, sysinfo);
1955
1956         if(v) {
1957                 tsc[0] = rdtsc();
1958
1959                 print_debug("set DQS timing:RcvrEn:Pass1: ");
1960                 print_debug_hex8(i);
1961         }
1962         if(train_DqsRcvrEn(ctrl, 1,  sysinfo)) {
1963                 sysinfo->mem_trained[i]=0x81; //
1964                 goto out;
1965         }
1966
1967         if(v) {
1968                 print_debug(" done\r\n");
1969                 tsc[1] = rdtsc();
1970                 print_debug("set DQS timing:DQSPos: ");
1971                 print_debug_hex8(i);
1972         }
1973
1974         if(train_DqsPos(ctrl, sysinfo)) {
1975                 sysinfo->mem_trained[i]=0x82; //
1976                 goto out;
1977         }
1978
1979         if(v) {
1980                 print_debug(" done\r\n");
1981                 tsc[2] = rdtsc();
1982
1983                 print_debug("set DQS timing:RcvrEn:Pass2: ");
1984                 print_debug_hex8(i);
1985         }
1986         if(train_DqsRcvrEn(ctrl, 2,  sysinfo)){
1987                 sysinfo->mem_trained[i]=0x83; //
1988                 goto out;
1989         }
1990
1991         if(v) {
1992                 print_debug(" done\r\n");
1993
1994                 tsc[3] = rdtsc();
1995         }
1996
1997 out:
1998 #if MEM_TRAIN_SEQ == 1
1999         clear_mtrr_dqs(sysinfo->tom2_k);
2000 #endif
2001
2002         if(v) {
2003                 for(ii=0;ii<4;ii++) {
2004                       print_debug_dqs_tsc_x("Total DQS Training : tsc ", ii,  tsc[ii].hi, tsc[ii].lo);
2005                 }
2006         }
2007
2008         if(sysinfo->mem_trained[i] == 0x80) {
2009                 sysinfo->mem_trained[i]=1;
2010         }
2011
2012 }
2013 #endif
2014
2015 #if MEM_TRAIN_SEQ == 1
2016 static void train_ram(unsigned nodeid, struct sys_info *sysinfo, struct sys_info *sysinfox)
2017 {
2018         dqs_timing(nodeid, &sysinfo->ctrl[nodeid], sysinfo, 0); // keep the output tidy
2019 //      memcpy(&sysinfox->dqs_rcvr_dly_a[nodeid * 2 * 8],&sysinfo->dqs_rcvr_dly_a[nodeid * 2 * 8], 2*8);
2020 //      memcpy(&sysinfox->dqs_delay_a[nodeid * 2 * 2 * 9], &sysinfo->dqs_delay_a[nodeid * 2 * 2 * 9], 2 * 2 * 9);
2021         sysinfox->mem_trained[nodeid] = sysinfo->mem_trained[nodeid];
2022
2023 }
2024 static void copy_and_run_ap_code_in_car(unsigned ret_addr);
2025 static inline void train_ram_on_node(unsigned nodeid, unsigned coreid, struct sys_info *sysinfo, unsigned retcall)
2026 {
2027         if(coreid) return; // only do it on core0
2028         struct sys_info *sysinfox = ((CONFIG_LB_MEM_TOPK<<10) - DCACHE_RAM_GLOBAL_VAR_SIZE);
2029         wait_till_sysinfo_in_ram(); // use pci to get it
2030
2031         if(sysinfox->mem_trained[nodeid] == 0x80) {
2032         #if 0
2033                 sysinfo->tom_k = sysinfox->tom_k;
2034                 sysinfo->tom2_k = sysinfox->tom2_k;
2035                 sysinfo->meminfo[nodeid].is_Width128 = sysinfox->meminfo[nodeid].is_Width128;
2036                 sysinfo->mem_trained[nodeid] = sysinfox->mem_trained[nodeid];
2037                 memcpy(&sysinfo->ctrl[nodeid], &sysinfox->ctrl[nodeid], sizeof(struct mem_controller));
2038         #else
2039                 memcpy(sysinfo, sysinfox, DCACHE_RAM_GLOBAL_VAR_SIZE);
2040         #endif
2041                 set_top_mem_ap(sysinfo->tom_k, sysinfo->tom2_k); // keep the ap's tom consistent with bsp's
2042         #if CONFIG_AP_CODE_IN_CAR == 0
2043                 print_debug("CODE IN ROM AND RUN ON NODE:"); print_debug_hex8(nodeid); print_debug("\r\n");
2044                 train_ram(nodeid, sysinfo, sysinfox);
2045         #else
2046                 /* Can copy dqs_timing to ap cache and run from cache?
2047                 * we need coreboot_ap_car.rom? and treat it as coreboot_ram.rom for ap ?
2048                 */
2049                 copy_and_run_ap_code_in_car(retcall);
2050                 // will go back by jump
2051         #endif
2052         }
2053 }
2054 #endif