1b5af6864f1ae338a63cd66fefd5d5dca902d334
[coreboot.git] / src / northbridge / amd / amdk8 / raminit_f_dqs.c
1 /*
2         yhlu 2005.10 dqs training
3 */
/* Debug verbosity for DQS training; 0 means no debug output at all. */
#define DQS_TRAIN_DEBUG 0

/*
 * Print the label 'str' followed by 'val' in hex and a CR/LF.
 *
 * Output is produced only when the compile-time verbosity DQS_TRAIN_DEBUG is
 * strictly greater than 'level'; with DQS_TRAIN_DEBUG == 0 the body is empty.
 */
static inline void print_debug_dqs(const char *str, unsigned val, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	if (!(DQS_TRAIN_DEBUG > level))
		return;

#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s%x\r\n", str, val);
#else
	print_debug(str);
	print_debug_hex32(val);
	print_debug("\r\n");
#endif
#endif
}
19
/*
 * Print two label/value pairs ("str val str2 val2") on one line, each value
 * as 8 hex digits, followed by CR/LF.
 *
 * Output is produced only when DQS_TRAIN_DEBUG is strictly greater than
 * 'level'; with DQS_TRAIN_DEBUG == 0 the body is empty.
 */
static inline void print_debug_dqs_pair(const char *str, unsigned val, const char *str2, unsigned val2, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	if (!(DQS_TRAIN_DEBUG > level))
		return;

#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s%08x%s%08x\r\n", str, val, str2, val2);
#else
	print_debug(str);
	print_debug_hex32(val);
	print_debug(str2);
	print_debug_hex32(val2);
	print_debug("\r\n");
#endif
#endif
}
32
/*
 * Print "str[i]=<val><val2>" with 'i' as two hex digits and both values as
 * 8 hex digits each (back to back), followed by CR/LF.
 *
 * Output is produced only when DQS_TRAIN_DEBUG is strictly greater than
 * 'level'; with DQS_TRAIN_DEBUG == 0 the body is empty.
 */
static inline void print_debug_dqs_tsc(const char *str, unsigned i, unsigned val, unsigned val2, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	if (!(DQS_TRAIN_DEBUG > level))
		return;

#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s[%02x]=%08x%08x\r\n", str, i, val, val2);
#else
	print_debug(str);
	print_debug("[");
	print_debug_hex8(i);
	print_debug("]=");
	print_debug_hex32(val);
	print_debug_hex32(val2);
	print_debug("\r\n");
#endif
#endif
}
45
/*
 * Unconditional variant of print_debug_dqs_tsc(): always prints
 * "str[i]=<val><val2>" followed by CR/LF, regardless of DQS_TRAIN_DEBUG.
 */
static inline void print_debug_dqs_tsc_x(const char *str, unsigned i, unsigned val, unsigned val2)
{
#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s[%02x]=%08x%08x\r\n", str, i, val, val2);
#else
	print_debug(str);
	print_debug("[");
	print_debug_hex8(i);
	print_debug("]=");
	print_debug_hex32(val);
	print_debug_hex32(val2);
	print_debug("\r\n");
#endif
}
55
56 static void fill_mem_cs_sysinfo(unsigned nodeid, const struct mem_controller *ctrl, struct sys_info *sysinfo)
57 {
58
59         int i;
60         sysinfo->mem_base[nodeid] = pci_read_config32(ctrl->f1, 0x40 + (nodeid<<3));
61
62         for(i=0;i<8; i++) {
63                 sysinfo->cs_base[nodeid*8+i] = pci_read_config32(ctrl->f2, 0x40 + (i<<2));
64         }
65
66         sysinfo->hole_reg[nodeid] = pci_read_config32(ctrl->f1, 0xf0);  
67
68 }
69 static unsigned Get_MCTSysAddr(const struct mem_controller *ctrl,  unsigned cs_idx, struct sys_info *sysinfo)
70 {
71         uint32_t dword;
72         uint32_t mem_base;
73         unsigned nodeid = ctrl->node_id;
74
75 #if HW_MEM_HOLE_SIZEK != 0      
76         uint32_t hole_reg;
77 #endif
78
79         //get the local base addr of the chipselect
80         dword = sysinfo->cs_base[nodeid * 8 + cs_idx];
81         dword &= 0xfffffff0;
82
83         //sys addr= node base + local cs base
84         mem_base = sysinfo->mem_base[nodeid];
85         mem_base &= 0xffff0000;
86
87         dword += mem_base;
88 #if HW_MEM_HOLE_SIZEK != 0
89         hole_reg = sysinfo->hole_reg[nodeid];
90         if(hole_reg & 1) {
91                 unsigned hole_startk;
92                 hole_startk = (hole_reg & (0xff<<24)) >> 10;
93                 if( (dword >= (hole_startk<<2)) && (dword < ((4*1024*1024)<<2))) { 
94                         dword += ((4*1024*1024 - hole_startk)<<2);
95                 }
96         }  
97 #endif
98
99         //add 1MB offset to avoid compat area
100         dword += (1<<(20-8));
101                 
102         //So final result is upper 32 bit addr 
103         
104         return dword;
105
106 }
107
/*
 * Test address for receiver-enable training of chip select 'cs_idx'.
 * Thin wrapper around Get_MCTSysAddr(); 'channel' is currently not used.
 */
static unsigned Get_RcvrSysAddr(const struct mem_controller * ctrl, unsigned channel, unsigned cs_idx, struct sys_info *sysinfo)
{
	unsigned sys_addr = Get_MCTSysAddr(ctrl, cs_idx, sysinfo);

	return sys_addr;
}
113
/* Return the current contents of control register CR4 (32-bit move). */
static inline unsigned long read_cr4(void)
{
	unsigned long value;

	__asm__ __volatile__ ("movl %%cr4, %0" : "=r" (value));

	return value;
}
120
/* Load control register CR4 with 'cr4' (32-bit move). */
static inline void write_cr4(unsigned long cr4)
{
	__asm__ __volatile__ ("movl %0, %%cr4" : /* no outputs */ : "r" (cr4));
}
125
126
/*
 * Set CR4 bit 9 so the SSE instructions used by the pattern writer can
 * execute (bit 9 is CR4.OSFXSR in the x86 architecture manuals).
 */
static inline void enable_sse2()
{
	write_cr4(read_cr4() | (1 << 9));
}
134
/*
 * Clear CR4 bit 9 again.  Note this clears the bit unconditionally; it does
 * not restore whatever value the bit had before enable_sse2().
 */
static inline void disable_sse2()
{
	write_cr4(read_cr4() & ~(1 << 9));
}
142
143
144 static void set_wrap32dis(void) {
145         msr_t msr;
146         
147         msr = rdmsr(0xc0010015);
148         msr.lo |= (1<<17);
149         
150         wrmsr(0xc0010015, msr);
151
152 }
153
154 static void clear_wrap32dis(void) {
155         msr_t msr;
156
157         msr = rdmsr(0xc0010015);
158         msr.lo &= ~(1<<17);
159
160         wrmsr(0xc0010015, msr);
161
162 }
163
164 static void set_FSBASE(uint32_t addr_hi)
165 {
166         msr_t msr;
167
168         //set fs and use fs prefix to access the mem
169         msr.hi = addr_hi;
170         msr.lo = 0;
171         wrmsr(0xc0000100, msr); //FS_BASE
172
173 }
174
175 static unsigned ChipSelPresent(const struct mem_controller *ctrl, unsigned cs_idx, struct sys_info *sysinfo)
176 {
177         unsigned enabled;
178         unsigned nodeid = ctrl->node_id;
179         
180
181         enabled = sysinfo->cs_base[nodeid * 8 + cs_idx];
182         enabled &= 1;
183
184         return enabled;
185
186 }
187
/*
 * Tell whether the rank behind 'cs_idx' should be trained on 'channel'.
 * In non-128-bit mode there is no channel B, so any non-zero channel yields
 * 0; otherwise the answer is ChipSelPresent() for that chip select.
 */
static unsigned RcvrRankEnabled(const struct mem_controller *ctrl, int channel, int cs_idx, unsigned is_Width128, struct sys_info *sysinfo)
{
	/* FIXME: process 64Muxed */
	if (!is_Width128 && channel)
		return 0; /* no channel b */

	return ChipSelPresent(ctrl, cs_idx, sysinfo);
}
197
/*
 * Copy 'line_num' 64-byte lines from 'buf_a' to fs:addr_lo using
 * non-temporal 16-byte stores (movdqa load + movntdq store, four per line).
 *
 * buf_a must be 16-byte aligned (movdqa requirement).  The FS base must have
 * been set up by the caller.  A line_num of 0 writes nothing.
 */
static void WriteLNTestPattern(unsigned addr_lo, uint8_t *buf_a, unsigned line_num)
{
	unsigned chunks = line_num * 4; /* 16-byte chunks to copy */

	/* "loop" decrements before testing, so 0 would wrap to 2^32 iterations */
	if (chunks == 0)
		return;

	/*
	 * The asm advances the destination offset, the source pointer and the
	 * counter, so they must be read-write operands rather than inputs; it
	 * also stores to memory, uses xmm0 and changes the flags.
	 */
	__asm__ volatile (
		"1:\n\t"
		"movdqa (%1), %%xmm0\n\t"
		"movntdq %%xmm0, %%fs:(%0)\n\t" /* xmm0 is 128 bit */
		"addl %3, %0\n\t"
		"addl %3, %1\n\t"
		"loop 1b\n\t"

		: "+a" (addr_lo), "+b" (buf_a), "+c" (chunks)
		: "d" (16)
		: "memory", "xmm0", "cc"
	);
}
213
/*
 * Write one 64-byte test line to system address 'addr' (given in units of
 * 256 bytes).  'p' selects the source: buf_b when p == 1, buf_a otherwise.
 */
static void Write1LTestPattern(unsigned addr, unsigned p, uint8_t *buf_a, uint8_t *buf_b)
{
	uint8_t *src = (p == 1) ? buf_b : buf_a;

	/* upper address bits go into FS base, the rest is the fs offset */
	set_FSBASE(addr >> 24);
	WriteLNTestPattern(addr << 8, src, 1);
}
224
/*
 * Perform one 32-bit read at system address 'addr' (units of 256 bytes)
 * through the FS segment.  The value read is discarded; the read is done for
 * its side effect of filling the cache line.
 */
static void Read1LTestPattern(unsigned addr)
{
	unsigned discarded;
	unsigned offset = addr << 8;

	set_FSBASE(addr >> 24);

	/* 1st move causes read fill (to exclusive or shared) */
	__asm__ volatile (
		"movl %%fs:(%1), %0\n\t"
		: "=b" (discarded)
		: "a" (offset)
	);
}
238
/* Result of a single pattern comparison */
#define DQS_PASS 0
#define DQS_FAIL 1

/* Which training pass is being run */
#define DQS_FIRST_PASS 1
#define DQS_SECOND_PASS 2

/* Status codes recorded when training runs into trouble */
#define SB_NORCVREN 11
#define SB_CHA2BRCVREN 12
#define SB_SmallRCVR 13
#define SB_NODQSPOS  14
#define SB_SMALLDQS 15

/* Tuning constants */
#define RCVREN_MARGIN 6
#define MIN_DQS_WNDW 3
252
253
254 static unsigned CompareTestPatternQW0(unsigned channel, unsigned addr, unsigned pattern, const uint32_t *TestPattern0, const uint32_t *TestPattern1, const uint32_t *TestPattern2, unsigned Pass, unsigned is_Width128)
255 {
256         uint32_t addr_lo;
257         uint32_t *test_buf;
258         uint32_t value;
259         uint32_t value_test;
260         unsigned result = DQS_FAIL;
261
262         if(Pass == DQS_FIRST_PASS) {
263                 if(pattern==1) {
264                         test_buf = (uint32_t *)TestPattern1;
265                 }
266                 else {
267                         test_buf = (uint32_t *)TestPattern0;
268                 }
269         }
270         else {
271                 test_buf = (uint32_t *)TestPattern2;
272         }
273
274         set_FSBASE(addr>>24);   
275         
276         addr_lo = addr<<8;
277         
278         if(is_Width128 && (channel == 1)) {
279                 addr_lo += 8; //second channel
280                 test_buf += 2;
281         }
282         
283         __asm__ volatile (
284                 "movl %%fs:(%1), %0\n\t"
285                 :"=b"(value): "a" (addr_lo)
286         );
287
288         value_test = *test_buf;
289
290         
291         print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : test_buf= ", (unsigned)test_buf, " value = ", value_test, 4); 
292         print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : addr_lo = ", addr_lo, " value = ", value, 4); 
293
294         if(value == value_test) {
295                 addr_lo += 4;
296                 test_buf++;
297                 __asm__ volatile (
298                         "movl %%fs:(%1), %0\n\t"
299                         :"=b"(value): "a" (addr_lo)
300                 );
301                 value_test = *test_buf;
302                 print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : test_buf= ", (unsigned)test_buf, " value = ", value_test, 4);
303                 print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : addr_lo = ", addr_lo, " value = ", value, 4);
304
305                 if(value == value_test){
306                         result =  DQS_PASS;
307                 }
308         }
309         
310         if(Pass == DQS_SECOND_PASS) { // second pass need to be inverted
311                 if(result==DQS_PASS) {
312                         result = DQS_FAIL;
313                 }
314                 else {
315                         result = DQS_PASS;
316                 }
317         }
318
319         return result;
320
321 }
322
323 static void SetMaxAL_RcvrDly(const struct mem_controller *ctrl, unsigned dly) 
324 {
325         uint32_t reg;
326
327         dly += (20-1); // round it
328         dly /= 20; // convert from unit 50ps to 1ns
329         
330         dly += 6;
331
332
333         reg = pci_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
334         reg &= ~(DCH_MaxAsyncLat_MASK <<DCH_MaxAsyncLat_SHIFT);
335         reg |= ((dly - DCH_MaxAsyncLat_BASE) << DCH_MaxAsyncLat_SHIFT);
336         pci_write_config32(ctrl->f2, DRAM_CONFIG_HIGH, reg);
337         
338 }
339
340 /*
341         Set the Target range to WT IO (using an IORR overlapping the already existing 
342         WB dram type). Use IORR0
343 */
344 static void SetTargetWTIO(unsigned addr)
345 {
346         msr_t msr;
347         msr.hi = addr>>24;
348         msr.lo = addr<<8;
349         wrmsr(0xc0010016, msr); //IORR0 BASE
350         
351         msr.hi = 0xff;
352         msr.lo = 0xfc000800;  // 64MB Mask
353         wrmsr(0xc0010017, msr); // IORR0 Mask 
354 }
355
356 static void ResetTargetWTIO(void)
357 {
358         msr_t msr;
359
360         msr.hi = 0;
361         msr.lo = 0;  
362         wrmsr(0xc0010017, msr); // IORR0 Mask
363 }
364
/*
 * Flush the cache line containing system address 'addr' (units of 256
 * bytes), addressed through the FS segment.
 */
static void proc_CLFLUSH(unsigned addr)
{
	unsigned offset = addr << 8;

	set_FSBASE(addr >> 24);

	/* clflush fs:[eax] */
	__asm__ volatile (
		"clflush %%fs:(%0)\n\t"
		:: "a" (offset)
	);
}
/*
 * Flush the cache line at 'addr' while the surrounding range is temporarily
 * covered by IORR0 (see SetTargetWTIO), then drop the IORR0 mapping again.
 */
static void proc_IOCLFLUSH(unsigned addr)
{
	SetTargetWTIO(addr);

	proc_CLFLUSH(addr);

	ResetTargetWTIO();
}
384
385 static void ResetDCTWrPtr(const struct mem_controller *ctrl)
386 {
387         uint32_t dword;
388         unsigned index = 0x10;
389         
390         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
391         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
392
393 }
394
395
396 static uint16_t get_exact_T1000(unsigned i)
397 {
398         //                                 200   266,   333,  400
399         static const uint16_t T1000_a[]= { 5000, 3759, 3003, 2500 };
400
401         static const uint16_t TT_a[] = {
402                  /*200   266   333   400 */
403          /*4 */   6250, 6250, 6250, 6250,
404          /*5 */   5000, 5000, 5000, 2500,
405          /*6 */   5000, 4166, 4166, 2500,
406          /*7 */   5000, 4285, 3571, 2500,
407
408          /*8 */   5000, 3750, 3125, 2500,
409          /*9 */   5000, 3888, 3333, 2500,
410          /*10*/   5000, 4000, 3000, 2500,
411          /*11*/   5000, 4090, 3181, 2500,
412
413          /*12*/   5000, 3750, 3333, 2500,
414          /*13*/   5000, 3846, 3076, 2500,
415          /*14*/   5000, 3928, 3214, 2500,
416          /*15*/   5000, 4000, 3000, 2500,
417         };
418
419         unsigned fid_cur;
420         int index;
421
422         msr_t msr;
423         msr = rdmsr(0xc0010042);
424         fid_cur = msr.lo & 0x3f;
425
426         index = fid_cur>>1;
427
428         if(index>12) return T1000_a[i];
429
430         return TT_a[index * 4+i];
431
432 }
433
434 static void InitDQSPos4RcvrEn(const struct mem_controller *ctrl)
435 {
436         int i;
437         uint32_t dword;
438         
439         dword = 0x00000000;
440         for(i=1; i<=3; i++) {
441                 /* Program the DQS Write Timing Control Registers (Function 2:Offset 0x9c, index 0x01-0x03, 0x21-0x23) to 0x00 for all bytes */
442                 pci_write_config32_index_wait(ctrl->f2, 0x98, i, dword);
443                 pci_write_config32_index_wait(ctrl->f2, 0x98, i+0x20, dword);
444         }
445
446         dword = 0x2f2f2f2f;
447         for(i=5; i<=7; i++) {
448                 /* Program the DQS Write Timing Control Registers (Function 2:Offset 0x9c, index 0x05-0x07, 0x25-0x27) to 0x2f for all bytes */
449                 pci_write_config32_index_wait(ctrl->f2, 0x98, i, dword);
450                 pci_write_config32_index_wait(ctrl->f2, 0x98, i+0x20, dword);
451         }
452
453
454 }
455 #ifndef K8_REV_F_SUPPORT_F0_F1_WORKAROUND 
456 #define K8_REV_F_SUPPORT_F0_F1_WORKAROUND 1
457 #endif
458
459 static unsigned TrainRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
460 {
461
462         static const uint32_t TestPattern0[] = {
463                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
464                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
465                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
466                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
467                 };
468         static const uint32_t TestPattern1[] = {
469                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
470                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
471                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
472                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
473                 };
474         static const uint32_t TestPattern2[] = { 
475                         0x12345678, 0x87654321, 0x23456789, 0x98765432,
476                         0x59385824, 0x30496724, 0x24490795, 0x99938733,
477                         0x40385642, 0x38465245, 0x29432163, 0x05067894,
478                         0x12349045, 0x98723467, 0x12387634, 0x34587623,
479                 };
480
481         uint8_t pattern_buf_x[64 * 4 + 16]; // We need to two cache line So have more 16 bytes to keep 16 byte alignment */ 
482         uint8_t *buf_a, *buf_b; 
483         uint32_t ecc_bit;
484         uint32_t dword;
485         uint8_t *dqs_rcvr_dly_a = &sysinfo->dqs_rcvr_dly_a[ctrl->node_id * 2* 8] ; //8 node, channel 2, receiver 8
486
487         int i;
488
489         unsigned channel, receiver;
490
491         unsigned Errors;
492         unsigned CTLRMaxDelay;
493         unsigned T1000;
494
495         unsigned LastTest;
496         unsigned CurrTest;
497         unsigned Test0, Test1;
498
499         unsigned RcvrEnDlyRmin;
500
501         unsigned two_ranks;
502         unsigned RcvrEnDly;
503
504         unsigned PatternA;
505         unsigned PatternB;
506
507         unsigned TestAddr0, TestAddr0B, TestAddr1, TestAddr1B;
508
509         unsigned CurrRcvrCHADelay;
510
511         unsigned tmp;
512
513         unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
514
515         unsigned cpu_f0_f1;
516
517         if(Pass == DQS_FIRST_PASS) {
518                 InitDQSPos4RcvrEn(ctrl);
519         }
520
521         //enable SSE2
522         enable_sse2();
523
524         //wrap32dis
525         set_wrap32dis();
526
527         //disable ECC temp
528         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
529         ecc_bit = dword & DCL_DimmEccEn;
530         dword &= ~(DCL_DimmEccEn); 
531         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
532
533
534         if(Pass == DQS_FIRST_PASS) {
535 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
536         cpu_f0_f1 = is_cpu_pre_f2_in_bsp(ctrl->node_id);
537         if(!cpu_f0_f1) 
538 #endif
539         {
540 #if 1
541                 /* Set the DqsRcvEnTrain bit */
542                 dword = pci_read_config32(ctrl->f2, DRAM_CTRL);
543                 dword |= DC_DqsRcvEnTrain;
544                 pci_write_config32(ctrl->f2, DRAM_CTRL, dword);
545 #endif
546         }
547         }
548
549         //get T1000 figures (cycle time (ns)) * 1K
550         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
551         dword &= DCH_MemClkFreq_MASK;
552
553         T1000 = get_exact_T1000(dword); 
554
555         // SetupRcvrPattern 
556         buf_a = (uint8_t *)(((uint32_t)(&pattern_buf_x[0]) + 0x10) & (0xfffffff0));
557         buf_b = buf_a + 128; //??
558         if(Pass==DQS_FIRST_PASS) {
559                 for(i=0;i<16;i++) {
560                         *((uint32_t *)(buf_a + i*4)) = TestPattern0[i];
561                         *((uint32_t *)(buf_b + i*4)) = TestPattern1[i];
562                 }
563         }
564         else {
565                 for(i=0;i<16;i++) {
566                         *((uint32_t *)(buf_a + i*4)) = TestPattern2[i];
567                         *((uint32_t *)(buf_b + i*4)) = TestPattern2[i];
568                 }
569         }
570
571         print_debug_dqs("\r\nTrainRcvEn: 0 ctrl", ctrl->node_id, 0);
572
573         print_debug_addr("TrainRcvEn: buf_a:", buf_a); 
574
575         Errors = 0;
576         /* for each channel */
577         CTLRMaxDelay = 0;
578         for(channel = 0; (channel < 2) && (!Errors); channel++) 
579         { 
580                 print_debug_dqs("\tTrainRcvEn51: channel ",channel, 1); 
581                 
582                 /* for each rank */ 
583                 /* there are four recriver pairs, loosely associated with CS */ 
584                 for( receiver = 0; (receiver < 8) && (!Errors); receiver+=2) 
585                 {
586                         
587                         unsigned index=(receiver>>1) * 3 + 0x10;
588
589                         print_debug_dqs("\t\tTrainRcvEn52: index ", index, 2); 
590
591                         if(is_Width128) {
592                                 if(channel) {
593                                         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
594                                         CurrRcvrCHADelay= dword & 0xff;
595                                 }
596                         }
597                         else {
598                                 if(channel) { 
599                                         index += 0x20;
600                                 }
601                         }       
602
603                         LastTest = DQS_FAIL;
604                         RcvrEnDlyRmin = 0xaf;
605                                 
606                         if(!RcvrRankEnabled(ctrl, channel, receiver, is_Width128, sysinfo)) continue;
607
608                         /* for each DQS receiver enable setting */
609         
610                         TestAddr0 = Get_RcvrSysAddr(ctrl, channel, receiver, sysinfo);
611
612                         TestAddr0B = TestAddr0 + (1<<(20+2-8)); // 4MB
613         
614                         if(RcvrRankEnabled(ctrl, channel, receiver+1, is_Width128, sysinfo)) {
615                                 TestAddr1 = Get_RcvrSysAddr(ctrl, channel, receiver+1, sysinfo);
616                                 TestAddr1B = TestAddr1 + (1<<(20+2-8)); //4MB
617                                 two_ranks = 1;
618                         }
619                         else {
620                                 two_ranks = 0;
621                         }
622
623                         print_debug_dqs("\t\tTrainRcvEn53: TestAddr0B ", TestAddr0B, 2); 
624
625                         Write1LTestPattern(TestAddr0, 0, buf_a, buf_b); // rank0 of dimm, test p0
626                         Write1LTestPattern(TestAddr0B, 1, buf_a, buf_b); //rank0 of dimm, test p1
627
628                         if(two_ranks == 1) {
629                                 Write1LTestPattern(TestAddr1, 0, buf_a, buf_b); //rank 1 of dimm
630                                 Write1LTestPattern(TestAddr1B, 1, buf_a, buf_b);//rank 1 of dimm
631                         }
632
633                         if(Pass == DQS_FIRST_PASS) {
634                                 RcvrEnDly = 0; 
635                         } else {
636                                 RcvrEnDly = dqs_rcvr_dly_a[channel * 8 + receiver];
637                         }
638
639                         while ( RcvrEnDly < 0xaf) { // Sweep Delay value here
640                                 print_debug_dqs("\t\t\tTrainRcvEn541: RcvrEnDly ", RcvrEnDly, 3);
641
642                                 if(RcvrEnDly & 1) {
643                                         /* Odd steps get another pattern such that even
644                                            and odd steps alternate.
645                                            The pointers to the patterns will be swapped
646                                            at the end of the loop so they are correspond
647                                         */
648                                         PatternA = 1;
649                                         PatternB = 0;
650                                 }
651                                 else {
652                                         /* Even step */
653                                         PatternA = 0;
654                                         PatternB = 1;
655                                 }
656
657                                 /* Program current Receiver enable delay */
658                                 pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
659                                 /* FIXME: 64bit MUX */
660         
661                                 if(is_Width128) {
662                                         /* Program current Receiver enable delay chaannel b */
663                                         pci_write_config32_index_wait(ctrl->f2, 0x98, index+ 0x20, RcvrEnDly);
664                                 }
665                         
666                                 /* Program the MaxAsyncLat filed with the
667                                    current DQS receiver enable setting plus 6ns
668                                 */      
669                                 /*Porgram MaxAsyncLat to correspond with current delay */
670                                 SetMaxAL_RcvrDly(ctrl, RcvrEnDly);
671
672                                 CurrTest = DQS_FAIL;
673
674                                 Read1LTestPattern(TestAddr0);  //Cache Fill
675                                 /* ROM vs cache compare */
676                                 Test0 = CompareTestPatternQW0(channel, TestAddr0, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
677                                 proc_IOCLFLUSH(TestAddr0);
678
679                                 ResetDCTWrPtr(ctrl);
680
681                                 print_debug_dqs("\t\t\tTrainRcvEn542: Test0 ", Test0, 3); 
682
683                                 if(Test0 == DQS_PASS) {
684
685                                         Read1LTestPattern(TestAddr0B);
686                                         Test1 = CompareTestPatternQW0(channel, TestAddr0B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
687                                         proc_IOCLFLUSH(TestAddr0B);
688
689                                         ResetDCTWrPtr(ctrl);
690
691                                         print_debug_dqs("\t\t\tTrainRcvEn543: Test1 ", Test1, 3); 
692                                         
693                                         if(Test1 == DQS_PASS) {
694                                                 if(two_ranks) {
695                                                         Read1LTestPattern(TestAddr1);
696                                                         Test0 = CompareTestPatternQW0(channel, TestAddr1, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
697                                                         proc_IOCLFLUSH(TestAddr1);
698                                                         ResetDCTWrPtr(ctrl);
699
700                                                         if(Test0 == DQS_PASS) {
701                                                                 Read1LTestPattern(TestAddr1B);
702                                                                 Test1 = CompareTestPatternQW0(channel, TestAddr1B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
703                                                                 proc_IOCLFLUSH(TestAddr1B);
704                                                                 ResetDCTWrPtr(ctrl);
705
706                                                                 if(Test1 == DQS_PASS) {
707                                                                         CurrTest = DQS_PASS;
708                                                                 }
709                                                         } 
710                                                         print_debug_dqs("\t\t\tTrainRcvEn544: Test0 ", Test0, 3); 
711                                                 }
712                                                 else {
713                                                         CurrTest = DQS_PASS;
714                                                 }
715                                         }
716                                 }
717
718                                 print_debug_dqs("\t\t\tTrainRcvEn55: RcvrEnDly ", RcvrEnDly, 3); 
719
720                                 if(CurrTest == DQS_PASS) {
721                                         if(LastTest == DQS_FAIL) {
722                                                 RcvrEnDlyRmin = RcvrEnDly;
723                                                 break;
724                                         }
725                                 }
726                                 
727                                 LastTest = CurrTest;
728                                 
729                                 /* swap the rank 0 pointers */
730                                 tmp = TestAddr0;
731                                 TestAddr0 = TestAddr0B;
732                                 TestAddr0B = tmp;
733
734                                 /* swap the rank 1 pointers */
735                                 tmp = TestAddr1;
736                                 TestAddr1 = TestAddr1B;
737                                 TestAddr1B = tmp;
738
739                                 print_debug_dqs("\t\t\tTrainRcvEn56: RcvrEnDly ", RcvrEnDly, 3); 
740                                 
741                                 RcvrEnDly++;
742                                 
743                         } // while RcvrEnDly
744
745                         print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDly ", RcvrEnDly, 2); 
746
747                         if(RcvrEnDlyRmin == 0xaf) {
748                                 //no passing window
749                                 Errors |= SB_NORCVREN;
750                         }
751
752                         if(Pass == DQS_FIRST_PASS) {
753                                 // We need a better value for DQSPos trainning
754                                 RcvrEnDly = RcvrEnDlyRmin /* + RCVREN_MARGIN * T1000/64/50 */;
755                         } else {
756                                 RcvrEnDly = RcvrEnDlyRmin;
757                         }
758
759                         if(RcvrEnDly > 0xae) {
760                                 //passing window too narrow, too far delayed
761                                 Errors |= SB_SmallRCVR;
762                                 RcvrEnDly = 0xae;
763                         }
764
765                         if(Pass == DQS_SECOND_PASS) { //second pass must average vales
766                                 RcvrEnDly += dqs_rcvr_dly_a[channel * 8 + receiver] /* - (RCVREN_MARGIN * T1000/64/50)*/;
767                                 RcvrEnDly >>= 1;
768                         }
769                 
770                         dqs_rcvr_dly_a[channel * 8 + receiver] = RcvrEnDly; 
771         
772                         //Set final RcvrEnDly for this DIMM and Channel 
773                         pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
774                 
775                         if(is_Width128) {
776                                 pci_write_config32_index_wait(ctrl->f2, 0x98, index+0x20, RcvrEnDly); // channel B
777                                 if(channel) { 
778                                         pci_write_config32_index_wait(ctrl->f2, 0x98, index, CurrRcvrCHADelay);
779                                         if(RcvrEnDly > CurrRcvrCHADelay) {
780                                                 dword = RcvrEnDly - CurrRcvrCHADelay;   
781                                         }
782                                         else {
783                                                 dword = CurrRcvrCHADelay - RcvrEnDly;
784                                         }
785                                         dword *= 50;
786                                         if(dword > T1000) {
787                                                 Errors |= SB_CHA2BRCVREN;
788                                         }
789                                 }
790                         }
791
792                         print_debug_dqs("\t\tTrainRcvEn63: RcvrEnDly ", RcvrEnDly, 2); 
793
794                         if(RcvrEnDly > CTLRMaxDelay) {
795                                 CTLRMaxDelay = RcvrEnDly;
796                         }
797
798                         print_debug_dqs("\t\tTrainRcvEn64: CTLRMaxDelay ", CTLRMaxDelay, 2); 
799                         
800                 } /* receiver */
801         } /* channel */
802
803         print_debug_dqs("\tTrainRcvEn65: CTLRMaxDelay ", CTLRMaxDelay, 1); 
804
805         /* Program the MaxAsysncLat field with the largest DQS Receiver Enable setting */
806         SetMaxAL_RcvrDly(ctrl, CTLRMaxDelay);
807         ResetDCTWrPtr(ctrl);
808
809         //Enable ECC again 
810         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
811         dword &= ~(DCL_DimmEccEn);
812         dword |= ecc_bit;
813         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
814
815         if(Pass == DQS_FIRST_PASS) {
816 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
817         if(!cpu_f0_f1) 
818 #endif
819         {
820                 dword = pci_read_config32(ctrl->f2, DRAM_CTRL);
821                 dword &= ~DC_DqsRcvEnTrain;
822                 pci_write_config32(ctrl->f2, DRAM_CTRL, dword);
823         }
824         }
825
826         //Clear wrap32dis 
827
828         clear_wrap32dis();
829
830         //restore SSE2 setting
831         disable_sse2();
832
833 #if MEM_TRAIN_SEQ != 1  
834         /* We need tidy output for type 1 */
835         #if CONFIG_USE_PRINTK_IN_CAR
836         printk_debug(" CTLRMaxDelay=%02x", CTLRMaxDelay);
837         #else
838         print_debug(" CTLRMaxDelay="); print_debug_hex8(CTLRMaxDelay); 
839         #endif
840 #endif
841
842         return (CTLRMaxDelay==0xae)?1:0;
843
844 }
845
846 #define DQS_READDIR 1
847 #define DQS_WRITEDIR 0
848
849
850 static void SetDQSDelayCSR(const struct mem_controller *ctrl, unsigned channel, unsigned bytelane, unsigned direction, unsigned dqs_delay)
851 { //ByteLane could be 0-8, last is for ECC
852         unsigned index;
853         uint32_t dword;
854         unsigned shift;
855
856         dqs_delay &= 0xff;
857
858         index = (bytelane>>2) + 1 + channel * 0x20 + (direction << 2);
859         shift = bytelane;
860         while(shift>3) {
861                 shift-=4;
862         }
863         shift <<= 3; // 8 bit
864
865         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
866         dword &= ~(0x3f<<shift);
867         dword |= (dqs_delay<<shift);
868         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
869
870 }
871
872 static void SetDQSDelayAllCSR(const struct mem_controller *ctrl, unsigned channel, unsigned direction, unsigned dqs_delay)
873 {
874         unsigned index;
875         uint32_t dword;
876         int i;
877         
878         dword = 0;
879         dqs_delay &= 0xff;
880         for(i=0;i<4;i++) { 
881                 dword |= dqs_delay<<(i*8);
882         }
883
884         index = 1 + channel * 0x20 + direction * 4;
885
886         for(i=0; i<2; i++) {
887                 pci_write_config32_index_wait(ctrl->f2, 0x98, index + i, dword);
888         }
889         
890 }
891
/*
 * Return the midpoint of the window [min_d, max_d].
 * When the window width is odd the result is rounded up.
 */
static unsigned MiddleDQS(unsigned min_d, unsigned max_d)
{
	const unsigned width = max_d - min_d;

	/* half of the width, plus one more step when the width is odd */
	return min_d + (width >> 1) + (width & 1);
}
901
/*
 * Record a trained delay in the per-node table.
 * The table is laid out as [channel][direction][bytelane] with 9 byte lanes
 * per direction and 2 directions per channel.
 */
static  inline void save_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, uint8_t *dqs_delay_a, uint8_t dqs_delay)
{
	uint8_t *slot = dqs_delay_a + (channel * 2 + direction) * 9 + bytelane;

	*slot = dqs_delay;
}
906
/*
 * Write the DQS test pattern held in buf_a to addr_lo.
 * Pattern 0 asks WriteLNTestPattern for 9 units, pattern 1 for 18
 * (presumably 64-byte lines, matching the 9/18-line read helpers -- confirm
 * against WriteLNTestPattern).
 */
static void WriteDQSTestPattern(unsigned addr_lo, unsigned pattern , uint8_t *buf_a)
{
	const unsigned line_count = 9 * (pattern + 1);

	WriteLNTestPattern(addr_lo, buf_a, line_count);
}
911
/*
 * Touch 18 locations spaced 64 bytes apart, starting at %fs:addr_lo, by
 * loading one dword from each. The loaded values are discarded; the caller
 * must have set up the FS base beforehand.
 *
 * Five base registers are used, each pointing 128 bytes into a group of four
 * locations so that the displacements are -128, -64, 0 and +64.
 */
static void ReadL18TestPattern(unsigned addr_lo)
{
	/*
	 * Every load overwrites %eax, so %eax has to be declared as an output.
	 * Passing it as an input-only operand would let the compiler assume
	 * the register still holds its old value after the asm statement.
	 */
	unsigned scratch;

	//set fs and use fs prefix to access the mem
	__asm__ volatile (
		"movl %%fs:-128(%%esi), %%eax\n\t"  //TestAddr cache line
		"movl %%fs:-64(%%esi), %%eax\n\t"   //+1
		"movl %%fs:(%%esi), %%eax\n\t"  //+2
		"movl %%fs:64(%%esi), %%eax\n\t"   //+3

		"movl %%fs:-128(%%edi), %%eax\n\t"      //+4
		"movl %%fs:-64(%%edi), %%eax\n\t"       //+5
		"movl %%fs:(%%edi), %%eax\n\t"  //+6
		"movl %%fs:64(%%edi), %%eax\n\t"        //+7

		"movl %%fs:-128(%%ebx), %%eax\n\t"  //+8
		"movl %%fs:-64(%%ebx), %%eax\n\t"       //+9
		"movl %%fs:(%%ebx), %%eax\n\t"  //+10
		"movl %%fs:64(%%ebx), %%eax\n\t"        //+11

		"movl %%fs:-128(%%ecx), %%eax\n\t"      //+12
		"movl %%fs:-64(%%ecx), %%eax\n\t"       //+13
		"movl %%fs:(%%ecx), %%eax\n\t"  //+14
		"movl %%fs:64(%%ecx), %%eax\n\t"        //+15

		"movl %%fs:-128(%%edx), %%eax\n\t"      //+16
		"movl %%fs:-64(%%edx), %%eax\n\t"       //+17

		: "=a" (scratch)
		: "b" (addr_lo+128+8*64), "c" (addr_lo+128+12*64), "d" (addr_lo +128+16*64), "S"(addr_lo+128), "D"(addr_lo+128+4*64)
	);

	(void)scratch;
}
943
/*
 * Touch 9 locations spaced 64 bytes apart, starting at %fs:addr_lo, by
 * loading one dword from each. The loaded values are discarded; the caller
 * must have set up the FS base beforehand.
 */
static void ReadL9TestPattern(unsigned addr_lo)
{
	/*
	 * Every load overwrites %eax, so %eax has to be declared as an output.
	 * Passing it as an input-only operand would let the compiler assume
	 * the register still holds its old value after the asm statement.
	 */
	unsigned scratch;

	//set fs and use fs prefix to access the mem
	__asm__ volatile (

		"movl %%fs:-128(%%ecx), %%eax\n\t"  //TestAddr cache line
		"movl %%fs:-64(%%ecx), %%eax\n\t"   //+1
		"movl %%fs:(%%ecx), %%eax\n\t"      //+2
		"movl %%fs:64(%%ecx), %%eax\n\t"   //+3

		"movl %%fs:-128(%%edx), %%eax\n\t"  //+4
		"movl %%fs:-64(%%edx), %%eax\n\t"   //+5
		"movl %%fs:(%%edx), %%eax\n\t"      //+6
		"movl %%fs:64(%%edx), %%eax\n\t"   //+7

		"movl %%fs:-128(%%ebx), %%eax\n\t"      //+8

		: "=a" (scratch)
		: "b" (addr_lo+128+8*64), "c"(addr_lo+128), "d"(addr_lo+128+4*64)
	);

	(void)scratch;
}
966
967
/*
 * Read back the test pattern at addr_lo: 9 lines for pattern 0,
 * 18 lines for any other pattern value.
 */
static void ReadDQSTestPattern(unsigned addr_lo, unsigned pattern)
{
	if (pattern != 0) {
		ReadL18TestPattern(addr_lo);
		return;
	}

	ReadL9TestPattern(addr_lo);
}
977
/*
 * Issue clflush for 9 locations spaced 64 bytes apart, starting at
 * %fs:addr_lo. Three base registers are used, each pointing 128 bytes into
 * its group so that the displacements are -128, -64, 0 and +64.
 */
static void FlushDQSTestPattern_L9(unsigned addr_lo)
{
	const unsigned group0 = addr_lo + 128;          /* lines 0-3 */
	const unsigned group1 = addr_lo + 128 + 4 * 64; /* lines 4-7 */
	const unsigned group2 = addr_lo + 128 + 8 * 64; /* line 8 */

	__asm__ volatile (
		/* lines 0-3 */
		"clflush %%fs:-128(%%ecx)\n\t"
		"clflush %%fs:-64(%%ecx)\n\t"
		"clflush %%fs:(%%ecx)\n\t"
		"clflush %%fs:64(%%ecx)\n\t"

		/* lines 4-7 */
		"clflush %%fs:-128(%%eax)\n\t"
		"clflush %%fs:-64(%%eax)\n\t"
		"clflush %%fs:(%%eax)\n\t"
		"clflush %%fs:64(%%eax)\n\t"

		/* line 8 */
		"clflush %%fs:-128(%%ebx)\n\t"

		:
		: "c" (group0), "a" (group1), "b" (group2)
	);
}
/*
 * Issue clflush for 18 locations spaced 64 bytes apart, starting at
 * %fs:addr_lo. Five base registers are used, each pointing 128 bytes into
 * its group so that the displacements are -128, -64, 0 and +64.
 */
static __attribute__((noinline)) void FlushDQSTestPattern_L18(unsigned addr_lo)
{
	const unsigned group0 = addr_lo + 128;           /* lines 0-3 */
	const unsigned group1 = addr_lo + 128 + 4 * 64;  /* lines 4-7 */
	const unsigned group2 = addr_lo + 128 + 8 * 64;  /* lines 8-11 */
	const unsigned group3 = addr_lo + 128 + 12 * 64; /* lines 12-15 */
	const unsigned group4 = addr_lo + 128 + 16 * 64; /* lines 16-17 */

	__asm__ volatile (
		/* lines 0-3 */
		"clflush %%fs:-128(%%eax)\n\t"
		"clflush %%fs:-64(%%eax)\n\t"
		"clflush %%fs:(%%eax)\n\t"
		"clflush %%fs:64(%%eax)\n\t"

		/* lines 4-7 */
		"clflush %%fs:-128(%%edi)\n\t"
		"clflush %%fs:-64(%%edi)\n\t"
		"clflush %%fs:(%%edi)\n\t"
		"clflush %%fs:64(%%edi)\n\t"

		/* lines 8-11 */
		"clflush %%fs:-128(%%ebx)\n\t"
		"clflush %%fs:-64(%%ebx)\n\t"
		"clflush %%fs:(%%ebx)\n\t"
		"clflush %%fs:64(%%ebx)\n\t"

		/* lines 12-15 */
		"clflush %%fs:-128(%%ecx)\n\t"
		"clflush %%fs:-64(%%ecx)\n\t"
		"clflush %%fs:(%%ecx)\n\t"
		"clflush %%fs:64(%%ecx)\n\t"

		/* lines 16-17 */
		"clflush %%fs:-128(%%edx)\n\t"
		"clflush %%fs:-64(%%edx)\n\t"

		:
		: "a" (group0), "D" (group1), "b" (group2), "c" (group3), "d" (group4)
	);
}
1026
/*
 * Flush the test pattern lines at addr_lo: 9 lines for pattern 0,
 * 18 lines for any other pattern value.
 */
static void FlushDQSTestPattern(unsigned addr_lo, unsigned pattern )
{
	if (pattern != 0) {
		FlushDQSTestPattern_L18(addr_lo);
		return;
	}

	FlushDQSTestPattern_L9(addr_lo);
}
1037
/*
 * Compare the memory at %fs:addr_lo against the reference pattern in buf_a.
 *
 * 9*64/4 dwords are compared. Each dword covers four byte lanes and the lane
 * counter wraps after eight lanes. With pattern 1 the data of the two
 * channels alternates in 8-byte groups, so after every eight lanes the other
 * channel's 8 bytes are skipped; a non-zero channel starts 8 bytes in.
 *
 * Returns a bitmap with one bit per byte lane (bit n = lane n); a bit stays
 * set only if every byte seen on that lane matched.
 */
static unsigned CompareDQSTestPattern(unsigned channel, unsigned addr_lo, unsigned pattern, uint8_t *buf_a)
{
	uint32_t *expect = (uint32_t *)buf_a;
	unsigned bitmap = 0xff;
	unsigned bytelane = 0;
	unsigned remaining;

	if (pattern && channel) {
		addr_lo += 8; //second channel
		expect += 2;
	}

	for (remaining = 9*64/4; remaining != 0; remaining--) {
		uint32_t value;
		uint32_t value_test;
		uint32_t diff;
		unsigned k;

		__asm__ volatile (
			"movl %%fs:(%1), %0\n\t"
			:"=b"(value): "a" (addr_lo)
		);
		value_test = *expect;

		print_debug_dqs_pair("\t\t\t\t\t\ttest_buf= ", (unsigned)expect, " value = ", value_test, 7);
		print_debug_dqs_pair("\t\t\t\t\t\ttaddr_lo = ",addr_lo, " value = ", value, 7);

		/* any set bit inside a byte of diff marks that lane as failed */
		diff = value ^ value_test;
		for (k = 0; k < 4; k++) {
			if ((diff >> (k * 8)) & 0xff) {
				bitmap &= ~(1<<bytelane);
			}
			bytelane = (bytelane + 1) & 0x7;
		}
		print_debug_dqs("\t\t\t\t\t\tbitmap = ", bitmap, 7);

		if ((bytelane == 0) && (pattern == 1)) { //dual channel
			addr_lo += 8; //skip over other channel's data
			expect += 2;
		}
		addr_lo += 4;
		expect += 1;
	}

	return bitmap;
}
1092
1093 static unsigned TrainDQSPos(const struct mem_controller *ctrl, unsigned channel, unsigned Direction, unsigned Pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1094 {
1095         unsigned ByteLane;
1096         unsigned Errors;
1097         unsigned BanksPresent;
1098
1099         unsigned MutualCSPassW[48];     
1100
1101         unsigned ChipSel;
1102         unsigned DQSDelay;
1103         
1104         unsigned TestAddr;
1105
1106         unsigned LastTest;
1107         unsigned RnkDlyFilterMax, RnkDlyFilterMin;
1108         unsigned RnkDlySeqPassMax, RnkDlySeqPassMin;
1109
1110         Errors = 0;
1111         BanksPresent = 0;
1112
1113         print_debug_dqs("\t\t\tTrainDQSPos begin ", 0, 3);
1114
1115         print_debug_addr("TrainDQSPos: MutualCSPassW[48] :", MutualCSPassW);
1116
1117         for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
1118                 MutualCSPassW[DQSDelay] = 0xff; // Bitmapped status per delay setting, 0xff=All positions passing (1= PASS)
1119         }
1120
1121         for(ChipSel = 0; ChipSel < 8; ChipSel++) { //logical register chipselects 0..7
1122                 print_debug_dqs("\t\t\t\tTrainDQSPos: 11 ChipSel ", ChipSel, 4); 
1123                 //FIXME: process 64MUXedMode
1124                 if(!ChipSelPresent(ctrl, ChipSel, sysinfo)) continue;
1125                 BanksPresent  = 1;
1126
1127                 TestAddr = Get_MCTSysAddr(ctrl, ChipSel, sysinfo);
1128
1129                 print_debug_dqs("\t\t\t\tTrainDQSPos: 12 TestAddr ", TestAddr, 4); 
1130
1131                 //set fs and use fs prefix to access the mem
1132                 set_FSBASE(TestAddr>>24);
1133
1134                 if(Direction == DQS_READDIR) {
1135                         print_debug_dqs("\t\t\t\tTrainDQSPos: 13 for read so write at first", 0, 4);
1136                         WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a);
1137                 }
1138
1139                 for(DQSDelay = 0; DQSDelay < 48; DQSDelay++ ){
1140                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 141 DQSDelay ", DQSDelay, 5); 
1141                         if(MutualCSPassW[DQSDelay] == 0) continue; //skip current delay value if other chipselects have failed all 8 bytelanes
1142                         SetDQSDelayAllCSR(ctrl, channel, Direction, DQSDelay);
1143                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 142 MutualCSPassW ", MutualCSPassW[DQSDelay], 5); 
1144                         if(Direction == DQS_WRITEDIR) {
1145                                 print_debug_dqs("\t\t\t\t\tTrainDQSPos: 143 for write", 0, 5);
1146                                 WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a); 
1147                         }
1148                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 144 Pattern ", Pattern, 5);
1149                         ReadDQSTestPattern(TestAddr<<8, Pattern); 
1150                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 145 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
1151                         MutualCSPassW[DQSDelay] &= CompareDQSTestPattern(channel, TestAddr<<8, Pattern, buf_a); //0: fail, 1=pass
1152                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 146 MutualCSPassW ", MutualCSPassW[DQSDelay], 5); 
1153                         SetTargetWTIO(TestAddr);
1154                         FlushDQSTestPattern(TestAddr<<8, Pattern); 
1155                         ResetTargetWTIO();
1156                 }
1157         }
1158
1159         if(BanksPresent) 
1160         for(ByteLane = 0; ByteLane < 8; ByteLane++) {
1161                 print_debug_dqs("\t\t\t\tTrainDQSPos: 31 ByteLane ",ByteLane, 4); 
1162
1163                 LastTest = DQS_FAIL;
1164                 RnkDlySeqPassMax = 0;
1165                 RnkDlyFilterMax = 0;
1166                 RnkDlyFilterMin = 0;
1167                 for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
1168                         if(MutualCSPassW[DQSDelay] & (1<<ByteLane)) {
1169
1170                                 print_debug_dqs("\t\t\t\t\tTrainDQSPos: 321 DQSDelay ", DQSDelay, 5); 
1171                                 print_debug_dqs("\t\t\t\t\tTrainDQSPos: 322 MutualCSPassW ", MutualCSPassW[DQSDelay], 5); 
1172
1173                                 RnkDlySeqPassMax = DQSDelay;
1174                                 if(LastTest == DQS_FAIL) {
1175                                         RnkDlySeqPassMin = DQSDelay; //start sequential run
1176                                 }
1177                                 if((RnkDlySeqPassMax - RnkDlySeqPassMin)>(RnkDlyFilterMax-RnkDlyFilterMin)){
1178                                         RnkDlyFilterMin = RnkDlySeqPassMin;
1179                                         RnkDlyFilterMax = RnkDlySeqPassMax;
1180                                 }
1181                                 LastTest = DQS_PASS;
1182                         }
1183                         else {
1184                                 LastTest = DQS_FAIL;
1185                         }
1186                 }
1187                 print_debug_dqs("\t\t\t\tTrainDQSPos: 33 RnkDlySeqPassMax ", RnkDlySeqPassMax, 4); 
1188
1189                 if(RnkDlySeqPassMax == 0) {
1190                         Errors |= SB_NODQSPOS; // no passing window
1191                 }
1192                 else {
1193                         print_debug_dqs("\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMax ", RnkDlyFilterMax, 4); 
1194                         print_debug_dqs("\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMin ", RnkDlyFilterMin, 4); 
1195                         if((RnkDlyFilterMax - RnkDlyFilterMin)< MIN_DQS_WNDW){
1196                                 Errors |= SB_SMALLDQS;
1197                         }
1198                         else {
1199                                 unsigned middle_dqs;
1200                                 middle_dqs = MiddleDQS(RnkDlyFilterMin, RnkDlyFilterMax); 
1201                                 print_debug_dqs("\t\t\t\tTrainDQSPos: 35 middle_dqs ",middle_dqs, 4); 
1202                                 SetDQSDelayCSR(ctrl, channel, ByteLane, Direction, middle_dqs);
1203                                 save_dqs_delay(channel, ByteLane, Direction, dqs_delay_a, middle_dqs);
1204                         }
1205                 }       
1206
1207         }
1208
1209         print_debug_dqs("\t\t\tTrainDQSPos: end", 0xff, 3);
1210         
1211         return Errors;
1212         
1213
1214 }
1215
1216 static unsigned TrainReadDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1217 {
1218         print_debug_dqs("\t\tTrainReadPos", 0, 2); 
1219         return TrainDQSPos(ctrl, channel, DQS_READDIR, pattern, buf_a, dqs_delay_a, sysinfo);   
1220 }
1221
1222 static unsigned TrainWriteDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1223 {
1224         print_debug_dqs("\t\tTrainWritePos", 0, 2);
1225         return TrainDQSPos(ctrl, channel, DQS_WRITEDIR, pattern, buf_a, dqs_delay_a, sysinfo);
1226 }
1227
1228
1229
1230 static unsigned TrainDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1231 {
1232         static const uint32_t TestPatternJD1a[] = {
1233                                         0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW0-1, ALL-EVEN
1234                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW2-3, ALL-EVEN
1235                                         0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW4-5, ALL-EVEN
1236                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW6-7, ALL-EVEN
1237                                         0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW0-1, DQ0-ODD
1238                                         0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW2-3, DQ0-ODD
1239                                         0x01010101,0x01010101,0xFeFeFeFe,0xFeFeFeFe, // QW4-5, DQ0-ODD
1240                                         0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW6-7, DQ0-ODD
1241                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW0-1, DQ1-ODD
1242                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2-3, DQ1-ODD
1243                                         0xFdFdFdFd,0xFdFdFdFd,0x02020202,0x02020202, // QW4-5, DQ1-ODD
1244                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW6-7, DQ1-ODD
1245                                         0x04040404,0x04040404,0xfBfBfBfB,0xfBfBfBfB, // QW0-1, DQ2-ODD
1246                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW2-3, DQ2-ODD
1247                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4-5, DQ2-ODD
1248                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6-7, DQ2-ODD
1249                                         0x08080808,0x08080808,0xF7F7F7F7,0xF7F7F7F7, // QW0-1, DQ3-ODD
1250                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW2-3, DQ3-ODD
1251                                         0xF7F7F7F7,0xF7F7F7F7,0x08080808,0x08080808, // QW4-5, DQ3-ODD
1252                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6-7, DQ3-ODD
1253                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW0-1, DQ4-ODD
1254                                         0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW2-3, DQ4-ODD
1255                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4-5, DQ4-ODD
1256                                         0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW6-7, DQ4-ODD
1257                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0-1, DQ5-ODD
1258                                         0xdFdFdFdF,0xdFdFdFdF,0x20202020,0x20202020, // QW2-3, DQ5-ODD
1259                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4-5, DQ5-ODD
1260                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6-7, DQ5-ODD
1261                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0-1, DQ6-ODD
1262                                         0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW2-3, DQ6-ODD
1263                                         0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW4-5, DQ6-ODD
1264                                         0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW6-7, DQ6-ODD
1265                                         0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW0-1, DQ7-ODD
1266                                         0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW2-3, DQ7-ODD
1267                                         0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW4-5, DQ7-ODD
1268                                         0x80808080,0x80808080,0x80808080,0x80808080  // QW6-7, DQ7-ODD
1269                 };
1270         static const uint32_t TestPatternJD1b[] = {
1271                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW0,CHA-B, ALL-EVEN
1272                                         0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW1,CHA-B, ALL-EVEN
1273                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW2,CHA-B, ALL-EVEN
1274                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW3,CHA-B, ALL-EVEN
1275                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW4,CHA-B, ALL-EVEN
1276                                         0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW5,CHA-B, ALL-EVEN
1277                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW6,CHA-B, ALL-EVEN
1278                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW7,CHA-B, ALL-EVEN
1279                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW0,CHA-B, DQ0-ODD
1280                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW1,CHA-B, DQ0-ODD
1281                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW2,CHA-B, DQ0-ODD
1282                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW3,CHA-B, DQ0-ODD
1283                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW4,CHA-B, DQ0-ODD
1284                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW5,CHA-B, DQ0-ODD
1285                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW6,CHA-B, DQ0-ODD
1286                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW7,CHA-B, DQ0-ODD
1287                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW0,CHA-B, DQ1-ODD
1288                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW1,CHA-B, DQ1-ODD
1289                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2,CHA-B, DQ1-ODD
1290                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW3,CHA-B, DQ1-ODD
1291                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW4,CHA-B, DQ1-ODD
1292                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW5,CHA-B, DQ1-ODD
1293                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW6,CHA-B, DQ1-ODD
1294                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW7,CHA-B, DQ1-ODD
1295                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW0,CHA-B, DQ2-ODD
1296                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW1,CHA-B, DQ2-ODD
1297                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW2,CHA-B, DQ2-ODD
1298                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW3,CHA-B, DQ2-ODD
1299                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4,CHA-B, DQ2-ODD
1300                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW5,CHA-B, DQ2-ODD
1301                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6,CHA-B, DQ2-ODD
1302                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW7,CHA-B, DQ2-ODD
1303                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW0,CHA-B, DQ3-ODD
1304                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW1,CHA-B, DQ3-ODD
1305                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW2,CHA-B, DQ3-ODD
1306                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW3,CHA-B, DQ3-ODD
1307                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW4,CHA-B, DQ3-ODD
1308                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW5,CHA-B, DQ3-ODD
1309                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6,CHA-B, DQ3-ODD
1310                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW7,CHA-B, DQ3-ODD
1311                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW0,CHA-B, DQ4-ODD
1312                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW1,CHA-B, DQ4-ODD
1313                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW2,CHA-B, DQ4-ODD
1314                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW3,CHA-B, DQ4-ODD
1315                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4,CHA-B, DQ4-ODD
1316                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW5,CHA-B, DQ4-ODD
1317                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW6,CHA-B, DQ4-ODD
1318                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW7,CHA-B, DQ4-ODD
1319                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0,CHA-B, DQ5-ODD
1320                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW1,CHA-B, DQ5-ODD
1321                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW2,CHA-B, DQ5-ODD
1322                                         0x20202020,0x20202020,0x20202020,0x20202020, // QW3,CHA-B, DQ5-ODD
1323                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4,CHA-B, DQ5-ODD
1324                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW5,CHA-B, DQ5-ODD
1325                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6,CHA-B, DQ5-ODD
1326                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW7,CHA-B, DQ5-ODD
1327                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0,CHA-B, DQ6-ODD
1328                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW1,CHA-B, DQ6-ODD
1329                                         0x40404040,0x40404040,0x40404040,0x40404040, // QW2,CHA-B, DQ6-ODD
1330                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW3,CHA-B, DQ6-ODD
1331                                         0x40404040,0x40404040,0x40404040,0x40404040, // QW4,CHA-B, DQ6-ODD
1332                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW5,CHA-B, DQ6-ODD
1333                                         0x40404040,0x40404040,0x40404040,0x40404040, // QW6,CHA-B, DQ6-ODD
1334                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW7,CHA-B, DQ6-ODD
1335                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW0,CHA-B, DQ7-ODD
1336                                         0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW1,CHA-B, DQ7-ODD
1337                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW2,CHA-B, DQ7-ODD
1338                                         0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW3,CHA-B, DQ7-ODD
1339                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW4,CHA-B, DQ7-ODD
1340                                         0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW5,CHA-B, DQ7-ODD
1341                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW6,CHA-B, DQ7-ODD
1342                                         0x80808080,0x80808080,0x80808080,0x80808080  // QW7,CHA-B, DQ7-ODD
1343                 };
1344         uint8_t pattern_buf_x[64 * 18 + 16]; // We need to two cache line So have more 16 bytes to keep 16 byte alignment */
1345         uint8_t *buf_a;
1346
1347         unsigned pattern;
1348         uint32_t dword;
1349         uint32_t ecc_bit;
1350         unsigned Errors;
1351         unsigned channel;
1352         int i;
1353         unsigned DQSWrDelay;
1354         unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
1355         uint8_t *dqs_delay_a = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9
1356
1357         //enable SSE2
1358         enable_sse2();
1359
1360         //wrap32dis
1361         set_wrap32dis();
1362
1363         //disable ECC temp
1364         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
1365         ecc_bit = dword & DCL_DimmEccEn;
1366         dword &= ~(DCL_DimmEccEn);
1367         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
1368
1369         //SetupDqsPattern
1370         buf_a = (uint8_t *)(((uint32_t)(&pattern_buf_x[0]) + 0x10) & (~0xf));
1371
1372         if(is_Width128){
1373                 pattern = 1;
1374                 for(i=0;i<16*18;i++) {
1375                         *((uint32_t *)(buf_a + i*4)) = TestPatternJD1b[i];
1376                  }
1377         }
1378         else {
1379                 pattern = 0;
1380                 for(i=0; i<16*9;i++) {
1381                         *((uint32_t *)(buf_a + i*4)) = TestPatternJD1a[i];
1382                 }
1383                 
1384         }
1385
1386         print_debug_dqs("\r\nTrainDQSRdWrPos: 0 ctrl ", ctrl->node_id, 0); 
1387
1388         print_debug_addr("TrainDQSRdWrPos: buf_a:", buf_a);
1389
1390         Errors = 0;
1391
1392         channel = 0;
1393         while( (channel<2) && (!Errors)) {
1394                 print_debug_dqs("\tTrainDQSRdWrPos: 1 channel ",channel, 1); 
1395                 for(DQSWrDelay = 0; DQSWrDelay < 48; DQSWrDelay++) {
1396                         unsigned err;
1397                         SetDQSDelayAllCSR(ctrl, channel, DQS_WRITEDIR, DQSWrDelay);
1398                         print_debug_dqs("\t\tTrainDQSRdWrPos: 21 DQSWrDelay ", DQSWrDelay, 2); 
1399                         err= TrainReadDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
1400                         print_debug_dqs("\t\tTrainDQSRdWrPos: 22 err ",err, 2); 
1401                         if(err == 0) break;
1402                         Errors |= err;
1403                 }
1404
1405                 print_debug_dqs("\tTrainDQSRdWrPos: 3 DQSWrDelay ", DQSWrDelay, 1); 
1406
1407                 if(DQSWrDelay < 48) {
1408                         Errors = TrainWriteDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
1409                         print_debug_dqs("\tTrainDQSRdWrPos: 4 Errors ", Errors, 1); 
1410
1411                 }
1412                 channel++;
1413                 if(!is_Width128){
1414                         //FIXME: 64MuxMode??    
1415                         channel++; // skip channel if 64-bit mode
1416                 }
1417         }
1418
1419         //Enable ECC again
1420         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
1421         dword &= ~(DCL_DimmEccEn);
1422         dword |= ecc_bit;
1423         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
1424
1425         //Clear wrap32dis
1426
1427         clear_wrap32dis();
1428
1429         //restore SSE2 setting
1430         disable_sse2();
1431
1432         print_debug_dqs("TrainDQSRdWrPos: ", 5, 0); 
1433         
1434         return Errors;
1435
1436 }
/*
 * Fetch one DQS delay from a per-node delay table.
 *
 * The table is indexed as [channel][direction][bytelane] with 2 directions
 * and 9 byte lanes per direction (channel * 2*9 + direction * 9 + bytelane).
 */
static inline uint8_t get_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, uint8_t *dqs_delay_a)
{
	return dqs_delay_a[channel * 2*9 + direction * 9 + bytelane];
}

/*
 * Interpolate a DQS delay between two byte lanes of one channel/direction.
 *
 * InterFactor (expected range 0..0xff) selects the position between the lanes:
 *	0:    100% ByteLane 0
 *	0x80: 50% between ByteLane 0 and 1
 *	0xff: 99.6% ByteLane 1 and 0.4% like 0
 *
 * The result is always computed as "smaller delay + fraction of the
 * difference", with the fraction expressed in 1/256 units.  When ByteLane0
 * holds the larger delay the weight has to be mirrored; it is mirrored around
 * 0x100 (not 0xff) so that InterFactor == 0 returns exactly the ByteLane0
 * delay, as documented above, instead of one step less.
 *
 * Returns the interpolated delay (fractional part truncated).
 */
static unsigned CalcEccDQSPos(unsigned channel,unsigned ByteLane0, unsigned ByteLane1, unsigned InterFactor, unsigned Direction, uint8_t *dqs_delay_a)
{
	unsigned DQSDelay0, DQSDelay1;
	unsigned base, span, weight;

	DQSDelay0 = get_dqs_delay(channel, ByteLane0, Direction, dqs_delay_a);
	DQSDelay1 = get_dqs_delay(channel, ByteLane1, Direction, dqs_delay_a);

	if (DQSDelay0 > DQSDelay1) {
		/* Measure from lane 1 upwards, so the weight is mirrored. */
		base = DQSDelay1;
		span = DQSDelay0 - DQSDelay1;
		weight = 0x100 - InterFactor;
	} else {
		base = DQSDelay0;
		span = DQSDelay1 - DQSDelay0;
		weight = InterFactor;
	}

	/* weight is in 1/256 units, hence the shift by 8 (i.e. /256). */
	return base + ((span * weight) >> 8);
}
1476
1477 static void SetEccDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1478 {       
1479         unsigned channel;
1480         unsigned ByteLane;
1481         unsigned Direction;
1482         unsigned lane0, lane1, ratio;
1483         unsigned dqs_delay;
1484
1485         unsigned direction[] = { DQS_READDIR, DQS_WRITEDIR };
1486         int i;
1487         uint8_t *dqs_delay_a = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9
1488
1489         ByteLane = 8;
1490
1491         for(channel = 0; channel < 2; channel++) {
1492                 for(i=0;i<2;i++) {
1493                         Direction = direction[i];
1494                         lane0 = 4; lane1 = 5; ratio = 0;
1495                         dqs_delay = CalcEccDQSPos(channel, lane0, lane1, ratio, Direction, dqs_delay_a);
1496                         print_debug_dqs_pair("\t\tSetEccDQSRdWrPos: channel ", channel, Direction==DQS_READDIR? " R dqs_delay":" W dqs_delay",  dqs_delay, 2); 
1497                         SetDQSDelayCSR(ctrl, channel, ByteLane, Direction, dqs_delay);
1498                         save_dqs_delay(channel, ByteLane, Direction, dqs_delay_a, dqs_delay);
1499                 }
1500         }
1501 }
1502
1503 static unsigned train_DqsRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
1504 {
1505         print_debug_dqs("\r\ntrain_DqsRcvrEn: begin ctrl ", ctrl->node_id, 0); 
1506         if(TrainRcvrEn(ctrl, Pass, sysinfo)) {
1507                 return 1;
1508         }
1509         print_debug_dqs("\r\ntrain_DqsRcvrEn: end ctrl ", ctrl->node_id, 0); 
1510         return 0;
1511         
1512 }
1513 static unsigned train_DqsPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1514 {
1515         print_debug_dqs("\r\ntrain_DqsPos: begin ctrl ", ctrl->node_id, 0); 
1516         if(TrainDQSRdWrPos(ctrl, sysinfo) != 0) {
1517                 print_err("\r\nDQS Training Rd Wr failed ctrl"); print_err_hex8(ctrl->node_id); print_err("\r\n");
1518                 return 1;
1519         }
1520         else {
1521                 SetEccDQSRdWrPos(ctrl, sysinfo);
1522         }
1523         print_debug_dqs("\r\ntrain_DqsPos: end ctrl ", ctrl->node_id, 0); 
1524         return 0;
1525         
1526 }
1527
1528 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1529 static void f0_svm_workaround(int controllers, const struct mem_controller *ctrl, tsc_t *tsc0, struct sys_info *sysinfo)
1530 {
1531         tsc_t tsc1[8];
1532         unsigned cpu_f0_f1[8];
1533         int i;
1534
1535         print_debug_addr("dqs_timing: tsc1[8] :", tsc1);
1536
1537         for(i = 0; i < controllers; i++) {
1538                 if (!sysinfo->ctrl_present[i])
1539                         continue;
1540
1541                 /* Skip everything if I don't have any memory on this controller */
1542                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1543
1544                 uint32_t dword;
1545
1546                 cpu_f0_f1[i] = is_cpu_pre_f2_in_bsp(i);
1547
1548                 if(!cpu_f0_f1[i]) continue;
1549
1550                 dword = pci_read_config32(ctrl[i].f2, DRAM_CTRL);
1551                 dword &= ~DC_DqsRcvEnTrain;
1552                 pci_write_config32(ctrl[i].f2, DRAM_CTRL, dword);
1553
1554                 dword = pci_read_config32(ctrl[i].f2, DRAM_INIT);
1555                 dword |= DI_EnDramInit;
1556                 pci_write_config32(ctrl[i].f2, DRAM_INIT, dword);
1557                 dword &= ~DI_EnDramInit;
1558                 pci_write_config32(ctrl[i].f2, DRAM_INIT, dword);
1559
1560                 tsc1[i] = rdtsc();
1561                 print_debug_dqs_tsc("begin: tsc1", i, tsc1[i].hi, tsc1[i].lo, 2);
1562
1563                 dword = tsc1[i].lo + tsc0[i].lo;
1564                 if((dword<tsc1[i].lo) || (dword<tsc0[i].lo)) {
1565                         tsc1[i].hi++;
1566                 }
1567                 tsc1[i].lo = dword;
1568                 tsc1[i].hi+= tsc0[i].hi;
1569
1570                 print_debug_dqs_tsc("end  : tsc1", i, tsc1[i].hi, tsc1[i].lo, 2);
1571
1572         }
1573
1574         for(i = 0; i < controllers; i++) {
1575                 if (!sysinfo->ctrl_present[i])
1576                         continue;
1577
1578                 /* Skip everything if I don't have any memory on this controller */
1579                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1580
1581                 if(!cpu_f0_f1[i]) continue;
1582
1583                 tsc_t tsc;
1584
1585                 do {
1586                         tsc = rdtsc();
1587                 } while ((tsc1[i].hi>tsc.hi) || ((tsc1[i].hi==tsc.hi) && (tsc1[i].lo>tsc.lo)));
1588
1589                 print_debug_dqs_tsc("end  : tsc ", i, tsc.hi, tsc.lo, 2);
1590         }
1591
1592 }
1593
1594 #endif
1595
1596
1597 /* setting variable mtrr, comes from linux kernel source */
1598 static void set_var_mtrr_dqs(
1599         unsigned int reg, unsigned long basek, unsigned long sizek,
1600         unsigned char type, unsigned address_bits)
1601 {
1602         msr_t base, mask;
1603         unsigned address_mask_high;
1604
1605         address_mask_high = ((1u << (address_bits - 32u)) - 1u);
1606
1607         base.hi = basek >> 22;
1608         base.lo  = basek << 10;
1609
1610         if (sizek < 4*1024*1024) {
1611                 mask.hi = address_mask_high;
1612                 mask.lo = ~((sizek << 10) -1);
1613         }
1614         else {
1615                 mask.hi = address_mask_high & (~((sizek >> 22) -1));
1616                 mask.lo = 0;
1617         }
1618
1619         if (reg >= 8)
1620                 return;
1621
1622         if (sizek == 0) {
1623                 msr_t zero;
1624                 zero.lo = zero.hi = 0;
1625                 /* The invalid bit is kept in the mask, so we simply clear the
1626                    relevant mask register to disable a range. */
1627                 wrmsr (MTRRphysMask_MSR(reg), zero);
1628         } else {
1629                 /* Bit 32-35 of MTRRphysMask should be set to 1 */
1630                 base.lo |= type;
1631                 mask.lo |= 0x800;
1632                 wrmsr (MTRRphysBase_MSR(reg), base);
1633                 wrmsr (MTRRphysMask_MSR(reg), mask);
1634         }
1635 }
1636
1637
/*
 * fms: find most significant bit set (idea taken from the Linux kernel).
 *
 * Returns the bit index (0..31) of the highest set bit of x, or 0 when x
 * is 0.
 */
static inline unsigned int fms(unsigned int x)
{
	if (x == 0)
		return 0;

	/* 32-bit operand: highest set bit = 31 - number of leading zeros. */
	return 31u - (unsigned int)__builtin_clz(x);
}
1649
/*
 * fls: find least significant bit set.
 *
 * Returns the bit index (0..31) of the lowest set bit of x, or 32 when x
 * is 0.  Note that, unlike the Linux helper of the same name, this one
 * looks for the lowest bit.
 */
static inline unsigned int fls(unsigned int x)
{
	if (x == 0)
		return 32;

	/* Lowest set bit = number of trailing zeros. */
	return (unsigned int)__builtin_ctz(x);
}
1661
1662 static unsigned int range_to_mtrr(unsigned int reg,
1663         unsigned long range_startk, unsigned long range_sizek,
1664         unsigned long next_range_startk, unsigned char type, unsigned address_bits)
1665 {
1666         if (!range_sizek || (reg >= 8)) {
1667                 return reg;
1668         }
1669         while(range_sizek) {
1670                 unsigned long max_align, align;
1671                 unsigned long sizek;
1672                 /* Compute the maximum size I can make a range */
1673                 max_align = fls(range_startk);
1674                 align = fms(range_sizek);
1675                 if (align > max_align) {
1676                         align = max_align;
1677                 }
1678                 sizek = 1 << align;
1679 #if MEM_TRAIN_SEQ != 1
1680         #if CONFIG_USE_PRINTK_IN_CAR
1681                 printk_debug("Setting variable MTRR %d, base: %4dMB, range: %4dMB, type %s\r\n",
1682                         reg, range_startk >>10, sizek >> 10,
1683                         (type==MTRR_TYPE_UNCACHEABLE)?"UC":
1684                             ((type==MTRR_TYPE_WRBACK)?"WB":"Other")
1685                         );
1686         #else
1687                 print_debug("Setting variable MTRR "); print_debug_hex8(reg); print_debug(", base: "); print_debug_hex16(range_startk>>10); 
1688                         print_debug("MB, range: "); print_debug_hex16(sizek >> 10); print_debug("MB, type "); 
1689                         print_debug( (type==MTRR_TYPE_UNCACHEABLE)?"UC\r\n":
1690                                       ((type==MTRR_TYPE_WRBACK)?"WB\r\n":"Other\r\n")
1691                                    );
1692         #endif
1693 #endif
1694                 set_var_mtrr_dqs(reg++, range_startk, sizek, type, address_bits);
1695                 range_startk += sizek;
1696                 range_sizek -= sizek;
1697                 if (reg >= 8)
1698                         break;
1699         }
1700         return reg;
1701 }
1702
1703 static void set_top_mem_ap(unsigned tom_k, unsigned tom2_k)
1704 {
1705         msr_t msr;
1706
1707         /* Now set top of memory */
1708         msr.lo = (tom2_k & 0x003fffff) << 10;
1709         msr.hi = (tom2_k & 0xffc00000) >> 22;
1710         wrmsr(TOP_MEM2, msr);
1711
1712         msr.lo = (tom_k & 0x003fffff) << 10;
1713         msr.hi = (tom_k & 0xffc00000) >> 22;
1714         wrmsr(TOP_MEM, msr);
1715 }
1716
1717 static void setup_mtrr_dqs(unsigned tom_k, unsigned tom2_k){
1718         unsigned reg;
1719         msr_t msr;
1720
1721 #if 0
1722         //still enable from cache_as_ram.inc
1723         msr = rdmsr(SYSCFG_MSR);
1724         msr.lo |= SYSCFG_MSR_MtrrFixDramModEn;
1725         wrmsr(SYSCFG_MSR,msr);
1726 #endif
1727
1728         //[0,512k), [512k, 640k)
1729         msr.hi = 0x1e1e1e1e;
1730         msr.lo = msr.hi;
1731         wrmsr(0x250, msr);
1732         wrmsr(0x258, msr);
1733
1734         //[1M, TOM)
1735         reg = range_to_mtrr(2, 0, tom_k,4*1024*1024, MTRR_TYPE_WRBACK, 40);
1736
1737         //[4G, TOM2)
1738         if(tom2_k) {
1739                 //enable tom2 and type
1740                 msr = rdmsr(SYSCFG_MSR);
1741                 msr.lo |= (1<<21) | (1<<22); //MtrrTom2En and Tom2ForceMemTypeWB
1742                 wrmsr(SYSCFG_MSR, msr);
1743         }
1744
1745 }
1746
1747 static void clear_mtrr_dqs(unsigned tom2_k){
1748         msr_t msr;
1749         unsigned i;
1750
1751         //still enable from cache_as_ram.inc
1752         msr = rdmsr(SYSCFG_MSR);
1753         msr.lo |= SYSCFG_MSR_MtrrFixDramModEn;
1754         wrmsr(SYSCFG_MSR,msr);
1755
1756         //[0,512k), [512k, 640k)
1757         msr.hi = 0;
1758         msr.lo = msr.hi;
1759         wrmsr(0x250, msr);
1760         wrmsr(0x258, msr);
1761
1762         //[1M, TOM)
1763         for(i=0x204;i<0x210;i++) {
1764                 wrmsr(i, msr);
1765         }
1766
1767         //[4G, TOM2)
1768         if(tom2_k) {
1769                 //enable tom2 and type
1770                 msr = rdmsr(SYSCFG_MSR);
1771                 msr.lo &= ~((1<<21) | (1<<22)); //MtrrTom2En and Tom2ForceMemTypeWB
1772                 wrmsr(SYSCFG_MSR, msr);
1773         }
1774 }
1775
1776 static void set_htic_bit(unsigned i, unsigned val, unsigned bit)
1777 {
1778         uint32_t dword;
1779         dword = pci_read_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL);
1780         dword &= ~(1<<bit);
1781         dword |= ((val & 1) <<bit);
1782         pci_write_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL, dword);
1783 }
1784
1785
1786 static unsigned get_htic_bit(unsigned i, unsigned bit)
1787 {
1788         uint32_t dword;
1789         dword = pci_read_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL);
1790         dword &= (1<<bit);
1791         return dword;
1792 }
1793
/*
 * Busy-wait until bit 9 of node 0's HT_INIT_CONTROL register reads as set.
 * There is no timeout.
 */
static void wait_till_sysinfo_in_ram(void)
{
	while (!get_htic_bit(0, 9)) {
		/* spin */
	}
}
1800
/*
 * Set bit 9 of node 0's HT_INIT_CONTROL register to the least significant
 * bit of val.
 */
static void set_sysinfo_in_ram(unsigned val)
{
	const unsigned node = 0;
	const unsigned flag_bit = 9;

	set_htic_bit(node, val, flag_bit);
}
1805
1806
1807 #if MEM_TRAIN_SEQ == 0
1808
1809
1810 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1811 static void dqs_timing(int controllers, const struct mem_controller *ctrl, tsc_t *tsc0, struct sys_info *sysinfo)
1812 #else
1813 static void dqs_timing(int controllers, const struct mem_controller *ctrl, struct sys_info *sysinfo)
1814 #endif
1815 {
1816         int  i;
1817
1818         tsc_t tsc[5];
1819
1820         //need to enable mtrr, so dqs training could access the test address
1821         setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
1822
1823         for(i = 0; i < controllers; i++) {
1824                 if (!sysinfo->ctrl_present[ i ])
1825                         continue;
1826
1827                 /* Skip everything if I don't have any memory on this controller */
1828                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1829
1830                 fill_mem_cs_sysinfo(i, ctrl+i, sysinfo);
1831         }
1832
1833         tsc[0] = rdtsc();
1834         for(i = 0; i < controllers; i++) {
1835                 if (!sysinfo->ctrl_present[ i ])
1836                         continue;
1837
1838                 /* Skip everything if I don't have any memory on this controller */
1839                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1840
1841                 print_debug("DQS Training:RcvrEn:Pass1: ");
1842                 print_debug_hex8(i);
1843                 if(train_DqsRcvrEn(ctrl+i, 1, sysinfo)) goto out;
1844                 print_debug(" done\r\n");
1845         }
1846
1847         tsc[1] = rdtsc();
1848 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1849         f0_svm_workaround(controllers, ctrl, tsc0, sysinfo);
1850 #endif
1851
1852         tsc[2] = rdtsc();
1853         for(i = 0; i < controllers; i++) {
1854                 if (!sysinfo->ctrl_present[i])
1855                         continue;
1856
1857                 /* Skip everything if I don't have any memory on this controller */
1858                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1859
1860                 print_debug("DQS Training:DQSPos: ");
1861                 print_debug_hex8(i);
1862                 if(train_DqsPos(ctrl+i, sysinfo)) goto out;
1863                 print_debug(" done\r\n");
1864         }
1865
1866         tsc[3] = rdtsc();
1867         for(i = 0; i < controllers; i++) {
1868                 if (!sysinfo->ctrl_present[i])
1869                         continue;
1870
1871                 /* Skip everything if I don't have any memory on this controller */
1872                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1873
1874                 print_debug("DQS Training:RcvrEn:Pass2: ");
1875                 print_debug_hex8(i);
1876                 if(train_DqsRcvrEn(ctrl+i, 2, sysinfo)) goto out;
1877                 print_debug(" done\r\n");
1878                 sysinfo->mem_trained[i]=1;
1879         }
1880
1881 out:
1882         tsc[4] = rdtsc();
1883         clear_mtrr_dqs(sysinfo->tom2_k);
1884
1885
1886         for(i=0;i<5;i++) {
1887                 print_debug_dqs_tsc_x("DQS Training:tsc", i,  tsc[i].hi, tsc[i].lo);
1888         }
1889
1890
1891         
1892 }
1893
1894 #endif
1895
1896
1897 #if MEM_TRAIN_SEQ > 0 
1898
1899 static void dqs_timing(int i, const struct mem_controller *ctrl, struct sys_info *sysinfo, unsigned v)
1900 {
1901
1902         int ii;
1903
1904          tsc_t tsc[4];
1905
1906         if(sysinfo->mem_trained[i] != 0x80) return;
1907
1908 #if MEM_TRAIN_SEQ == 1
1909         //need to enable mtrr, so dqs training could access the test address
1910         setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
1911 #endif
1912
1913         fill_mem_cs_sysinfo(i, ctrl, sysinfo);
1914
1915         if(v) {
1916                 tsc[0] = rdtsc();
1917
1918                 print_debug("set DQS timing:RcvrEn:Pass1: ");
1919                 print_debug_hex8(i);
1920         }
1921         if(train_DqsRcvrEn(ctrl, 1,  sysinfo)) {
1922                 sysinfo->mem_trained[i]=0x81; //
1923                 goto out;
1924         }
1925
1926         if(v) {
1927                 print_debug(" done\r\n");
1928                 tsc[1] = rdtsc();
1929                 print_debug("set DQS timing:DQSPos: ");
1930                 print_debug_hex8(i);
1931         }
1932
1933         if(train_DqsPos(ctrl, sysinfo)) {
1934                 sysinfo->mem_trained[i]=0x82; //
1935                 goto out;
1936         }
1937         
1938         if(v) {
1939                 print_debug(" done\r\n");
1940                 tsc[2] = rdtsc();
1941
1942                 print_debug("set DQS timing:RcvrEn:Pass2: ");
1943                 print_debug_hex8(i);
1944         }
1945         if(train_DqsRcvrEn(ctrl, 2,  sysinfo)){
1946                 sysinfo->mem_trained[i]=0x83; //
1947                 goto out;
1948         }
1949
1950         if(v) {
1951                 print_debug(" done\r\n");
1952
1953                 tsc[3] = rdtsc();
1954         }
1955
1956 out:
1957 #if MEM_TRAIN_SEQ == 1
1958         clear_mtrr_dqs(sysinfo->tom2_k);
1959 #endif
1960
1961         if(v) {
1962                 for(ii=0;ii<4;ii++) {
1963                       print_debug_dqs_tsc_x("Total DQS Training : tsc ", ii,  tsc[ii].hi, tsc[ii].lo);
1964                 }
1965         }
1966         
1967         if(sysinfo->mem_trained[i] == 0x80) {
1968                 sysinfo->mem_trained[i]=1;
1969         }
1970
1971 }
1972 #endif
1973
1974 #if MEM_TRAIN_SEQ == 1
1975 static void train_ram(unsigned nodeid, struct sys_info *sysinfo, struct sys_info *sysinfox)
1976 {
1977         dqs_timing(nodeid, &sysinfo->ctrl[nodeid], sysinfo, 0); // keep the output tidy
1978 //      memcpy(&sysinfox->dqs_rcvr_dly_a[nodeid * 2 * 8],&sysinfo->dqs_rcvr_dly_a[nodeid * 2 * 8], 2*8);
1979 //      memcpy(&sysinfox->dqs_delay_a[nodeid * 2 * 2 * 9], &sysinfo->dqs_delay_a[nodeid * 2 * 2 * 9], 2 * 2 * 9);
1980         sysinfox->mem_trained[nodeid] = sysinfo->mem_trained[nodeid];
1981
1982 }
1983 static void copy_and_run_ap_code_in_car(unsigned ret_addr);
1984 static inline void train_ram_on_node(unsigned nodeid, unsigned coreid, struct sys_info *sysinfo, unsigned retcall)
1985 {
1986         if(coreid) return; // only do it on core0
1987         struct sys_info *sysinfox = ((CONFIG_LB_MEM_TOPK<<10) - DCACHE_RAM_GLOBAL_VAR_SIZE);
1988         wait_till_sysinfo_in_ram(); // use pci to get it
1989
1990         if(sysinfox->mem_trained[nodeid] == 0x80) {
1991         #if 0
1992                 sysinfo->tom_k = sysinfox->tom_k;
1993                 sysinfo->tom2_k = sysinfox->tom2_k;
1994                 sysinfo->meminfo[nodeid].is_Width128 = sysinfox->meminfo[nodeid].is_Width128;
1995                 sysinfo->mem_trained[nodeid] = sysinfox->mem_trained[nodeid];
1996                 memcpy(&sysinfo->ctrl[nodeid], &sysinfox->ctrl[nodeid], sizeof(struct mem_controller));
1997         #else
1998                 memcpy(sysinfo, sysinfox, DCACHE_RAM_GLOBAL_VAR_SIZE);
1999         #endif
2000                 set_top_mem_ap(sysinfo->tom_k, sysinfo->tom2_k); // keep the ap's tom consistent with bsp's
2001         #if CONFIG_AP_CODE_IN_CAR == 0
2002                 print_debug("CODE IN ROM AND RUN ON NODE:"); print_debug_hex8(nodeid); print_debug("\r\n");
2003                 train_ram(nodeid, sysinfo, sysinfox);
2004         #else
2005                 /* Can copy dqs_timing to ap cache and run from cache?
2006                 * we need coreboot_ap_car.rom? and treat it as coreboot_ram.rom for ap ?
2007                 */
2008                 copy_and_run_ap_code_in_car(retcall);
2009                 // will go back by jump
2010         #endif
2011         }
2012 }
2013 #endif