This patch for the AMD K8 allows a single DIMM to be populated in the
[coreboot.git] / src / northbridge / amd / amdk8 / raminit_f_dqs.c
1 /*
2  * This file is part of the coreboot project.
3  *
4  * Copyright (C) 2005 YingHai Lu
5  * Copyright (C) 2008 Advanced Micro Devices, Inc.
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; version 2 of the License.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19 */
20
21 //0: means no debug info; a print_debug_dqs* message is printed only when DQS_TRAIN_DEBUG is greater than its 'level' argument
22 #define DQS_TRAIN_DEBUG 0
23
/*
 * Print a label followed by one value in hex on the debug console.
 *
 * str   - text printed in front of the value
 * val   - value to print
 * level - verbosity level of this message; it is printed only when
 *         DQS_TRAIN_DEBUG is greater than level
 *
 * The body is compiled out entirely when DQS_TRAIN_DEBUG is 0.
 */
static inline void print_debug_dqs(const char *str, unsigned val, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	/* Drop messages that are too detailed for the configured verbosity. */
	if (level >= DQS_TRAIN_DEBUG)
		return;

#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s%x\r\n", str, val);
#else
	print_debug(str);
	print_debug_hex32(val);
	print_debug("\r\n");
#endif
#endif
}
36
/*
 * Print two labelled 32-bit hex values on one debug console line:
 * "<str><val><str2><val2>".
 *
 * level - verbosity level of this message; it is printed only when
 *         DQS_TRAIN_DEBUG is greater than level
 *
 * The body is compiled out entirely when DQS_TRAIN_DEBUG is 0.
 */
static inline void print_debug_dqs_pair(const char *str, unsigned val, const char *str2, unsigned val2, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	/* Drop messages that are too detailed for the configured verbosity. */
	if (level >= DQS_TRAIN_DEBUG)
		return;

#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s%08x%s%08x\r\n", str, val, str2, val2);
#else
	print_debug(str);
	print_debug_hex32(val);
	print_debug(str2);
	print_debug_hex32(val2);
	print_debug("\r\n");
#endif
#endif
}
49
/*
 * Print an indexed pair of 32-bit values as "<str>[<i>]=<val><val2>"
 * (index as two hex digits, both values as eight hex digits, no separator).
 *
 * level - verbosity level of this message; it is printed only when
 *         DQS_TRAIN_DEBUG is greater than level
 *
 * The body is compiled out entirely when DQS_TRAIN_DEBUG is 0.
 */
static inline void print_debug_dqs_tsc(const char *str, unsigned i, unsigned val, unsigned val2, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	/* Drop messages that are too detailed for the configured verbosity. */
	if (level >= DQS_TRAIN_DEBUG)
		return;

#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s[%02x]=%08x%08x\r\n", str, i, val, val2);
#else
	print_debug(str);
	print_debug("[");
	print_debug_hex8(i);
	print_debug("]=");
	print_debug_hex32(val);
	print_debug_hex32(val2);
	print_debug("\r\n");
#endif
#endif
}
62
/*
 * Unconditional variant of print_debug_dqs_tsc(): always prints
 * "<str>[<i>]=<val><val2>" regardless of DQS_TRAIN_DEBUG.
 */
static inline void print_debug_dqs_tsc_x(const char *str, unsigned i, unsigned val, unsigned val2)
{
#if CONFIG_USE_PRINTK_IN_CAR
	printk_debug("%s[%02x]=%08x%08x\r\n", str, i, val, val2);
#else
	print_debug(str);
	print_debug("[");
	print_debug_hex8(i);
	print_debug("]=");
	print_debug_hex32(val);
	print_debug_hex32(val2);
	print_debug("\r\n");
#endif
}
72
73 static void fill_mem_cs_sysinfo(unsigned nodeid, const struct mem_controller *ctrl, struct sys_info *sysinfo)
74 {
75
76         int i;
77         sysinfo->mem_base[nodeid] = pci_read_config32(ctrl->f1, 0x40 + (nodeid<<3));
78
79         for(i=0;i<8; i++) {
80                 sysinfo->cs_base[nodeid*8+i] = pci_read_config32(ctrl->f2, 0x40 + (i<<2));
81         }
82
83         sysinfo->hole_reg[nodeid] = pci_read_config32(ctrl->f1, 0xf0);  
84
85 }
86 static unsigned Get_MCTSysAddr(const struct mem_controller *ctrl,  unsigned cs_idx, struct sys_info *sysinfo)
87 {
88         uint32_t dword;
89         uint32_t mem_base;
90         unsigned nodeid = ctrl->node_id;
91
92 #if HW_MEM_HOLE_SIZEK != 0      
93         uint32_t hole_reg;
94 #endif
95
96         //get the local base addr of the chipselect
97         dword = sysinfo->cs_base[nodeid * 8 + cs_idx];
98         dword &= 0xfffffff0;
99
100         //sys addr= node base + local cs base
101         mem_base = sysinfo->mem_base[nodeid];
102         mem_base &= 0xffff0000;
103
104         dword += mem_base;
105 #if HW_MEM_HOLE_SIZEK != 0
106         hole_reg = sysinfo->hole_reg[nodeid];
107         if(hole_reg & 1) {
108                 unsigned hole_startk;
109                 hole_startk = (hole_reg & (0xff<<24)) >> 10;
110                 if( (dword >= (hole_startk<<2)) && (dword < ((4*1024*1024)<<2))) { 
111                         dword += ((4*1024*1024 - hole_startk)<<2);
112                 }
113         }  
114 #endif
115
116         //add 1MB offset to avoid compat area
117         dword += (1<<(20-8));
118                 
119         //So final result is upper 32 bit addr 
120         
121         return dword;
122
123 }
124
/*
 * Return the test address for receiver-enable training of a chip select.
 * This simply forwards to Get_MCTSysAddr(); the channel argument is not
 * used.
 */
static unsigned Get_RcvrSysAddr(const struct mem_controller * ctrl, unsigned channel, unsigned cs_idx, struct sys_info *sysinfo)
{
	unsigned sys_addr = Get_MCTSysAddr(ctrl, cs_idx, sysinfo);

	return sys_addr;
}
130
131 static inline unsigned long read_cr4(void)
132 {
133         unsigned long cr4;
134         asm volatile ("movl %%cr4, %0" : "=r" (cr4));
135         return cr4;
136 }
137
138 static inline void write_cr4(unsigned long cr4)
139 {
140         asm volatile ("movl %0, %%cr4" : : "r" (cr4));
141 }
142
143
/* Set bit 9 of CR4 so the SSE instructions used by the test pattern writer can execute. */
static inline void enable_sse2()
{
	write_cr4(read_cr4() | (1<<9));
}
151
/*
 * Clear bit 9 of CR4 again (counterpart of enable_sse2()).
 * The bit is cleared unconditionally; its previous state is not restored.
 */
static inline void disable_sse2()
{
	write_cr4(read_cr4() & ~(1<<9));
}
159
160
161 static void set_wrap32dis(void) {
162         msr_t msr;
163         
164         msr = rdmsr(0xc0010015);
165         msr.lo |= (1<<17);
166         
167         wrmsr(0xc0010015, msr);
168
169 }
170
171 static void clear_wrap32dis(void) {
172         msr_t msr;
173
174         msr = rdmsr(0xc0010015);
175         msr.lo &= ~(1<<17);
176
177         wrmsr(0xc0010015, msr);
178
179 }
180
181 static void set_FSBASE(uint32_t addr_hi)
182 {
183         msr_t msr;
184
185         //set fs and use fs prefix to access the mem
186         msr.hi = addr_hi;
187         msr.lo = 0;
188         wrmsr(0xc0000100, msr); //FS_BASE
189
190 }
191
192 static unsigned ChipSelPresent(const struct mem_controller *ctrl, unsigned cs_idx, struct sys_info *sysinfo)
193 {
194         unsigned enabled;
195         unsigned nodeid = ctrl->node_id;
196         
197
198         enabled = sysinfo->cs_base[nodeid * 8 + cs_idx];
199         enabled &= 1;
200
201         return enabled;
202
203 }
204
/*
 * Report whether the rank behind chip select cs_idx is populated.
 * Forwards to ChipSelPresent(); channel and is_Width128 are not used.
 */
static unsigned RcvrRankEnabled(const struct mem_controller *ctrl, int channel, int cs_idx, unsigned is_Width128, struct sys_info *sysinfo)
{
	unsigned present = ChipSelPresent(ctrl, cs_idx, sysinfo);

	return present;
}
209
/*
 * Write line_num 64-byte lines of test pattern to memory through the FS
 * segment using non-temporal 16-byte stores.
 *
 * addr_lo  - offset of the destination relative to the FS base
 * buf_a    - source pattern; read with movdqa, so it has to be 16-byte
 *            aligned and at least line_num * 64 bytes long
 * line_num - number of 64-byte lines to write (4 stores of 16 bytes each)
 *
 * The caller must have programmed the FS base (see set_FSBASE()) and
 * enabled SSE (see enable_sse2()) beforehand.
 */
static void WriteLNTestPattern(unsigned addr_lo, uint8_t *buf_a, unsigned line_num)
{
	unsigned chunks = line_num * 4;

	/* "loop" decrements before testing, so a zero count would wrap around. */
	if (chunks == 0)
		return;

	/*
	 * The destination offset, the source pointer and the loop counter are
	 * all modified by the asm, so they are declared as read-write
	 * operands. The "memory" clobber makes sure the pattern buffer has
	 * really been written before the asm reads it, and tells the compiler
	 * that memory is written by the asm.
	 * NOTE(review): xmm0 is overwritten as well; it is not listed as a
	 * clobber on the assumption that the compiler does not keep values in
	 * SSE registers in this code - confirm for the build flags in use.
	 */
	__asm__ volatile (
		"1:\n\t"
		"movdqa (%1), %%xmm0\n\t"
		"movntdq %%xmm0, %%fs:(%0)\n\t" /* xmm0 is 128 bit */
		"addl %3, %0\n\t"
		"addl %3, %1\n\t"
		"loop 1b\n\t"

		: "+a" (addr_lo), "+b" (buf_a), "+c" (chunks)
		: "d" (16)
		: "memory", "cc"
	);
}
225
/*
 * Write one 64-byte line of test pattern to the given test address.
 *
 * addr  - test address in the format returned by Get_MCTSysAddr()
 *         (addr >> 24 becomes the FS base high dword, addr << 8 the offset)
 * p     - pattern selector: 1 writes buf_b, anything else writes buf_a
 * buf_a - first pattern buffer
 * buf_b - second pattern buffer
 */
static void Write1LTestPattern(unsigned addr, unsigned p, uint8_t *buf_a, uint8_t *buf_b)
{
	uint8_t *pattern = (p == 1) ? buf_b : buf_a;

	set_FSBASE(addr >> 24);
	WriteLNTestPattern(addr << 8, pattern, 1);
}
236
/*
 * Touch the line at the given test address with a single 32-bit read
 * through the FS segment. The value read is discarded; the read is done
 * for its cache fill effect only.
 *
 * addr - test address in the format returned by Get_MCTSysAddr()
 */
static void Read1LTestPattern(unsigned addr)
{
	unsigned discarded;

	set_FSBASE(addr >> 24);

	/* The first move causes a read fill (to exclusive or shared). */
	__asm__ volatile (
		"movl %%fs:(%1), %0\n\t"
		: "=b" (discarded)
		: "a" (addr << 8)
	);
}
250
251 #define DQS_PASS 0 /* test result: pattern matched (returned by CompareTestPatternQW0) */
252 #define DQS_FAIL 1 /* test result: pattern mismatch */
253
254 #define DQS_FIRST_PASS 1 /* value of the 'Pass' argument: first training pass */
255 #define DQS_SECOND_PASS 2 /* value of the 'Pass' argument: second training pass (compare result is inverted) */
256
257 #define SB_NORCVREN 11 /* ORed into Errors by TrainRcvrEn when no passing receiver enable delay was found */
258 #define RCVREN_MARGIN 6 /* only referenced from commented-out expressions in TrainRcvrEn */
259 #define SB_SmallRCVR 13 /* ORed into Errors by TrainRcvrEn when the trained delay exceeds 0xae */
260 #define SB_CHA2BRCVREN 12 /* ORed into Errors by TrainRcvrEn when channel A and B delays differ by more than T1000 */
261 #define SB_NODQSPOS  14 /* not used in the receiver enable code; presumably for DQS position training - TODO confirm */
262 #define MIN_DQS_WNDW 3 /* not used in the receiver enable code; presumably for DQS position training - TODO confirm */
263 #define SB_SMALLDQS 15 /* not used in the receiver enable code; presumably for DQS position training - TODO confirm */
264
265
266 static unsigned CompareTestPatternQW0(unsigned channel, unsigned addr, unsigned pattern, const uint32_t *TestPattern0, const uint32_t *TestPattern1, const uint32_t *TestPattern2, unsigned Pass, unsigned is_Width128)
267 {
268         uint32_t addr_lo;
269         uint32_t *test_buf;
270         uint32_t value;
271         uint32_t value_test;
272         unsigned result = DQS_FAIL;
273
274         if(Pass == DQS_FIRST_PASS) {
275                 if(pattern==1) {
276                         test_buf = (uint32_t *)TestPattern1;
277                 }
278                 else {
279                         test_buf = (uint32_t *)TestPattern0;
280                 }
281         }
282         else {
283                 test_buf = (uint32_t *)TestPattern2;
284         }
285
286         set_FSBASE(addr>>24);   
287         
288         addr_lo = addr<<8;
289         
290         if(is_Width128 && (channel == 1)) {
291                 addr_lo += 8; //second channel
292                 test_buf += 2;
293         }
294         
295         __asm__ volatile (
296                 "movl %%fs:(%1), %0\n\t"
297                 :"=b"(value): "a" (addr_lo)
298         );
299
300         value_test = *test_buf;
301
302         
303         print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : test_buf= ", (unsigned)test_buf, " value = ", value_test, 4); 
304         print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : addr_lo = ", addr_lo, " value = ", value, 4); 
305
306         if(value == value_test) {
307                 addr_lo += 4;
308                 test_buf++;
309                 __asm__ volatile (
310                         "movl %%fs:(%1), %0\n\t"
311                         :"=b"(value): "a" (addr_lo)
312                 );
313                 value_test = *test_buf;
314                 print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : test_buf= ", (unsigned)test_buf, " value = ", value_test, 4);
315                 print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : addr_lo = ", addr_lo, " value = ", value, 4);
316
317                 if(value == value_test){
318                         result =  DQS_PASS;
319                 }
320         }
321         
322         if(Pass == DQS_SECOND_PASS) { // second pass need to be inverted
323                 if(result==DQS_PASS) {
324                         result = DQS_FAIL;
325                 }
326                 else {
327                         result = DQS_PASS;
328                 }
329         }
330
331         return result;
332
333 }
334
335 static void SetMaxAL_RcvrDly(const struct mem_controller *ctrl, unsigned dly) 
336 {
337         uint32_t reg;
338
339         dly += (20-1); // round it
340         dly /= 20; // convert from unit 50ps to 1ns
341         
342         dly += 6;
343
344
345         reg = pci_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
346         reg &= ~(DCH_MaxAsyncLat_MASK <<DCH_MaxAsyncLat_SHIFT);
347         reg |= ((dly - DCH_MaxAsyncLat_BASE) << DCH_MaxAsyncLat_SHIFT);
348         pci_write_config32(ctrl->f2, DRAM_CONFIG_HIGH, reg);
349         
350 }
351
352 /*
353         Set the Target range to WT IO (using an IORR overlapping the already existing 
354         WB dram type). Use IORR0
355 */
356 static void SetTargetWTIO(unsigned addr)
357 {
358         msr_t msr;
359         msr.hi = addr>>24;
360         msr.lo = addr<<8;
361         wrmsr(0xc0010016, msr); //IORR0 BASE
362         
363         msr.hi = 0xff;
364         msr.lo = 0xfc000800;  // 64MB Mask
365         wrmsr(0xc0010017, msr); // IORR0 Mask 
366 }
367
368 static void ResetTargetWTIO(void)
369 {
370         msr_t msr;
371
372         msr.hi = 0;
373         msr.lo = 0;  
374         wrmsr(0xc0010017, msr); // IORR0 Mask
375 }
376
/*
 * Flush the cache line containing the given test address.
 *
 * addr - test address in the format returned by Get_MCTSysAddr()
 */
static void proc_CLFLUSH(unsigned addr)
{
	set_FSBASE(addr >> 24);

	__asm__ volatile (
		/* clflush fs:[eax] */
		"clflush %%fs:(%0)\n\t"
		: /* no outputs */
		: "a" (addr << 8)
	);
}
/*
 * Flush the cache line at the given test address while the range is
 * temporarily mapped through IORR0 (see SetTargetWTIO()); the IORR0 mask
 * is cleared again afterwards.
 */
static void proc_IOCLFLUSH(unsigned addr)
{
	SetTargetWTIO(addr);	/* map the range through IORR0 */
	proc_CLFLUSH(addr);	/* flush the line */
	ResetTargetWTIO();	/* drop the IORR0 mapping again */
}
396
397 static void ResetDCTWrPtr(const struct mem_controller *ctrl)
398 {
399         uint32_t dword;
400         unsigned index = 0x10;
401
402         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
403         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
404
405         index += 0x20;
406         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
407         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
408
409 }
410
411
412 static uint16_t get_exact_T1000(unsigned i)
413 {
414         //                                 200   266,   333,  400
415         static const uint16_t T1000_a[]= { 5000, 3759, 3003, 2500 };
416
417         static const uint16_t TT_a[] = {
418                  /*200   266   333   400 */
419          /*4 */   6250, 6250, 6250, 6250,
420          /*5 */   5000, 5000, 5000, 2500,
421          /*6 */   5000, 4166, 4166, 2500,
422          /*7 */   5000, 4285, 3571, 2500,
423
424          /*8 */   5000, 3750, 3125, 2500,
425          /*9 */   5000, 3888, 3333, 2500,
426          /*10*/   5000, 4000, 3000, 2500,
427          /*11*/   5000, 4090, 3181, 2500,
428
429          /*12*/   5000, 3750, 3333, 2500,
430          /*13*/   5000, 3846, 3076, 2500,
431          /*14*/   5000, 3928, 3214, 2500,
432          /*15*/   5000, 4000, 3000, 2500,
433         };
434
435         unsigned fid_cur;
436         int index;
437
438         msr_t msr;
439         msr = rdmsr(0xc0010042);
440         fid_cur = msr.lo & 0x3f;
441
442         index = fid_cur>>1;
443
444         if(index>12) return T1000_a[i];
445
446         return TT_a[index * 4+i];
447
448 }
449
450 static void InitDQSPos4RcvrEn(const struct mem_controller *ctrl)
451 {
452         int i;
453         uint32_t dword;
454         
455         dword = 0x00000000;
456         for(i=1; i<=3; i++) {
457                 /* Program the DQS Write Timing Control Registers (Function 2:Offset 0x9c, index 0x01-0x03, 0x21-0x23) to 0x00 for all bytes */
458                 pci_write_config32_index_wait(ctrl->f2, 0x98, i, dword);
459                 pci_write_config32_index_wait(ctrl->f2, 0x98, i+0x20, dword);
460         }
461
462         dword = 0x2f2f2f2f;
463         for(i=5; i<=7; i++) {
464                 /* Program the DQS Write Timing Control Registers (Function 2:Offset 0x9c, index 0x05-0x07, 0x25-0x27) to 0x2f for all bytes */
465                 pci_write_config32_index_wait(ctrl->f2, 0x98, i, dword);
466                 pci_write_config32_index_wait(ctrl->f2, 0x98, i+0x20, dword);
467         }
468
469
470 }
471 #ifndef K8_REV_F_SUPPORT_F0_F1_WORKAROUND 
472 #define K8_REV_F_SUPPORT_F0_F1_WORKAROUND 1
473 #endif
474
475 static unsigned TrainRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
476 {
477
478         static const uint32_t TestPattern0[] = {
479                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
480                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
481                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
482                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
483                 };
484         static const uint32_t TestPattern1[] = {
485                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
486                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
487                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
488                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
489                 };
490         static const uint32_t TestPattern2[] = { 
491                         0x12345678, 0x87654321, 0x23456789, 0x98765432,
492                         0x59385824, 0x30496724, 0x24490795, 0x99938733,
493                         0x40385642, 0x38465245, 0x29432163, 0x05067894,
494                         0x12349045, 0x98723467, 0x12387634, 0x34587623,
495                 };
496
497         uint8_t pattern_buf_x[64 * 4 + 16]; // We need to two cache line So have more 16 bytes to keep 16 byte alignment */ 
498         uint8_t *buf_a, *buf_b; 
499         uint32_t ecc_bit;
500         uint32_t dword;
501         uint8_t *dqs_rcvr_dly_a = &sysinfo->dqs_rcvr_dly_a[ctrl->node_id * 2* 8] ; //8 node, channel 2, receiver 8
502
503         int i;
504
505         unsigned channel, receiver;
506
507         unsigned Errors;
508         unsigned CTLRMaxDelay;
509         unsigned T1000;
510
511         unsigned LastTest;
512         unsigned CurrTest;
513         unsigned Test0, Test1;
514
515         unsigned RcvrEnDlyRmin;
516
517         unsigned two_ranks;
518         unsigned RcvrEnDly;
519
520         unsigned PatternA;
521         unsigned PatternB;
522
523         unsigned TestAddr0, TestAddr0B, TestAddr1, TestAddr1B = 0;
524
525         unsigned CurrRcvrCHADelay = 0;
526
527         unsigned tmp;
528
529         unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
530
531         unsigned cpu_f0_f1;
532
533         if(Pass == DQS_FIRST_PASS) {
534                 InitDQSPos4RcvrEn(ctrl);
535         }
536
537         //enable SSE2
538         enable_sse2();
539
540         //wrap32dis
541         set_wrap32dis();
542
543         //disable ECC temp
544         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
545         ecc_bit = dword & DCL_DimmEccEn;
546         dword &= ~(DCL_DimmEccEn); 
547         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
548
549
550         if(Pass == DQS_FIRST_PASS) {
551 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
552         cpu_f0_f1 = is_cpu_pre_f2_in_bsp(ctrl->node_id);
553         if(!cpu_f0_f1) 
554 #endif
555         {
556 #if 1
557                 /* Set the DqsRcvEnTrain bit */
558                 dword = pci_read_config32(ctrl->f2, DRAM_CTRL);
559                 dword |= DC_DqsRcvEnTrain;
560                 pci_write_config32(ctrl->f2, DRAM_CTRL, dword);
561 #endif
562         }
563         }
564
565         //get T1000 figures (cycle time (ns)) * 1K
566         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
567         dword &= DCH_MemClkFreq_MASK;
568
569         T1000 = get_exact_T1000(dword); 
570
571         // SetupRcvrPattern 
572         buf_a = (uint8_t *)(((uint32_t)(&pattern_buf_x[0]) + 0x10) & (0xfffffff0));
573         buf_b = buf_a + 128; //??
574         if(Pass==DQS_FIRST_PASS) {
575                 for(i=0;i<16;i++) {
576                         *((uint32_t *)(buf_a + i*4)) = TestPattern0[i];
577                         *((uint32_t *)(buf_b + i*4)) = TestPattern1[i];
578                 }
579         }
580         else {
581                 for(i=0;i<16;i++) {
582                         *((uint32_t *)(buf_a + i*4)) = TestPattern2[i];
583                         *((uint32_t *)(buf_b + i*4)) = TestPattern2[i];
584                 }
585         }
586
587         print_debug_dqs("\r\nTrainRcvEn: 0 ctrl", ctrl->node_id, 0);
588
589         print_debug_addr("TrainRcvEn: buf_a:", buf_a); 
590
591         Errors = 0;
592         /* for each channel */
593         CTLRMaxDelay = 0;
594         channel = 0;
595
596         if (!(sysinfo->meminfo[ctrl->node_id].dimm_mask & 0x0F) &&
597              (sysinfo->meminfo[ctrl->node_id].dimm_mask & 0xF0)) { /* channelB only? */
598                 channel = 1;
599         }
600
601         for ( ; (channel < 2) && (!Errors); channel++)
602         { 
603                 print_debug_dqs("\tTrainRcvEn51: channel ",channel, 1); 
604                 
605                 /* for each rank */ 
606                 /* there are four recriver pairs, loosely associated with CS */ 
607                 for( receiver = 0; (receiver < 8) && (!Errors); receiver+=2) 
608                 {
609                         
610                         unsigned index=(receiver>>1) * 3 + 0x10;
611
612                         print_debug_dqs("\t\tTrainRcvEn52: index ", index, 2); 
613
614                         if(is_Width128) {
615                                 if(channel) {
616                                         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
617                                         CurrRcvrCHADelay= dword & 0xff;
618                                 }
619                         }
620                         else {
621                                 if(channel) { 
622                                         index += 0x20;
623                                 }
624                         }       
625
626                         LastTest = DQS_FAIL;
627                         RcvrEnDlyRmin = 0xaf;
628                                 
629                         if(!RcvrRankEnabled(ctrl, channel, receiver, is_Width128, sysinfo)) continue;
630
631                         /* for each DQS receiver enable setting */
632         
633                         TestAddr0 = Get_RcvrSysAddr(ctrl, channel, receiver, sysinfo);
634
635                         TestAddr0B = TestAddr0 + (1<<(20+2-8)); // 4MB
636         
637                         if(RcvrRankEnabled(ctrl, channel, receiver+1, is_Width128, sysinfo)) {
638                                 TestAddr1 = Get_RcvrSysAddr(ctrl, channel, receiver+1, sysinfo);
639                                 TestAddr1B = TestAddr1 + (1<<(20+2-8)); //4MB
640                                 two_ranks = 1;
641                         }
642                         else {
643                                 two_ranks = 0;
644                         }
645
646                         print_debug_dqs("\t\tTrainRcvEn53: TestAddr0B ", TestAddr0B, 2); 
647
648                         Write1LTestPattern(TestAddr0, 0, buf_a, buf_b); // rank0 of dimm, test p0
649                         Write1LTestPattern(TestAddr0B, 1, buf_a, buf_b); //rank0 of dimm, test p1
650
651                         if(two_ranks == 1) {
652                                 Write1LTestPattern(TestAddr1, 0, buf_a, buf_b); //rank 1 of dimm
653                                 Write1LTestPattern(TestAddr1B, 1, buf_a, buf_b);//rank 1 of dimm
654                         }
655
656                         if(Pass == DQS_FIRST_PASS) {
657                                 RcvrEnDly = 0; 
658                         } else {
659                                 RcvrEnDly = dqs_rcvr_dly_a[channel * 8 + receiver];
660                         }
661
662                         while ( RcvrEnDly < 0xaf) { // Sweep Delay value here
663                                 print_debug_dqs("\t\t\tTrainRcvEn541: RcvrEnDly ", RcvrEnDly, 3);
664
665                                 if(RcvrEnDly & 1) {
666                                         /* Odd steps get another pattern such that even
667                                            and odd steps alternate.
668                                            The pointers to the patterns will be swapped
669                                            at the end of the loop so they are correspond
670                                         */
671                                         PatternA = 1;
672                                         PatternB = 0;
673                                 }
674                                 else {
675                                         /* Even step */
676                                         PatternA = 0;
677                                         PatternB = 1;
678                                 }
679
680                                 /* Program current Receiver enable delay */
681                                 pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
682                                 /* FIXME: 64bit MUX */
683         
684                                 if(is_Width128) {
685                                         /* Program current Receiver enable delay chaannel b */
686                                         pci_write_config32_index_wait(ctrl->f2, 0x98, index+ 0x20, RcvrEnDly);
687                                 }
688                         
689                                 /* Program the MaxAsyncLat filed with the
690                                    current DQS receiver enable setting plus 6ns
691                                 */      
692                                 /*Porgram MaxAsyncLat to correspond with current delay */
693                                 SetMaxAL_RcvrDly(ctrl, RcvrEnDly);
694
695                                 CurrTest = DQS_FAIL;
696
697                                 Read1LTestPattern(TestAddr0);  //Cache Fill
698                                 /* ROM vs cache compare */
699                                 Test0 = CompareTestPatternQW0(channel, TestAddr0, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
700                                 proc_IOCLFLUSH(TestAddr0);
701
702                                 ResetDCTWrPtr(ctrl);
703
704                                 print_debug_dqs("\t\t\tTrainRcvEn542: Test0 ", Test0, 3); 
705
706                                 if(Test0 == DQS_PASS) {
707
708                                         Read1LTestPattern(TestAddr0B);
709                                         Test1 = CompareTestPatternQW0(channel, TestAddr0B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
710                                         proc_IOCLFLUSH(TestAddr0B);
711
712                                         ResetDCTWrPtr(ctrl);
713
714                                         print_debug_dqs("\t\t\tTrainRcvEn543: Test1 ", Test1, 3); 
715                                         
716                                         if(Test1 == DQS_PASS) {
717                                                 if(two_ranks) {
718                                                         Read1LTestPattern(TestAddr1);
719                                                         Test0 = CompareTestPatternQW0(channel, TestAddr1, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
720                                                         proc_IOCLFLUSH(TestAddr1);
721                                                         ResetDCTWrPtr(ctrl);
722
723                                                         if(Test0 == DQS_PASS) {
724                                                                 Read1LTestPattern(TestAddr1B);
725                                                                 Test1 = CompareTestPatternQW0(channel, TestAddr1B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
726                                                                 proc_IOCLFLUSH(TestAddr1B);
727                                                                 ResetDCTWrPtr(ctrl);
728
729                                                                 if(Test1 == DQS_PASS) {
730                                                                         CurrTest = DQS_PASS;
731                                                                 }
732                                                         } 
733                                                         print_debug_dqs("\t\t\tTrainRcvEn544: Test0 ", Test0, 3); 
734                                                 }
735                                                 else {
736                                                         CurrTest = DQS_PASS;
737                                                 }
738                                         }
739                                 }
740
741                                 print_debug_dqs("\t\t\tTrainRcvEn55: RcvrEnDly ", RcvrEnDly, 3); 
742
743                                 if(CurrTest == DQS_PASS) {
744                                         if(LastTest == DQS_FAIL) {
745                                                 RcvrEnDlyRmin = RcvrEnDly;
746                                                 break;
747                                         }
748                                 }
749                                 
750                                 LastTest = CurrTest;
751                                 
752                                 /* swap the rank 0 pointers */
753                                 tmp = TestAddr0;
754                                 TestAddr0 = TestAddr0B;
755                                 TestAddr0B = tmp;
756
757                                 /* swap the rank 1 pointers */
758                                 tmp = TestAddr1;
759                                 TestAddr1 = TestAddr1B;
760                                 TestAddr1B = tmp;
761
762                                 print_debug_dqs("\t\t\tTrainRcvEn56: RcvrEnDly ", RcvrEnDly, 3); 
763                                 
764                                 RcvrEnDly++;
765                                 
766                         } // while RcvrEnDly
767
768                         print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDly ", RcvrEnDly, 2); 
769
770                         if(RcvrEnDlyRmin == 0xaf) {
771                                 //no passing window
772                                 Errors |= SB_NORCVREN;
773                         }
774
775                         if(Pass == DQS_FIRST_PASS) {
776                                 // We need a better value for DQSPos trainning
777                                 RcvrEnDly = RcvrEnDlyRmin /* + RCVREN_MARGIN * T1000/64/50 */;
778                         } else {
779                                 RcvrEnDly = RcvrEnDlyRmin;
780                         }
781
782                         if(RcvrEnDly > 0xae) {
783                                 //passing window too narrow, too far delayed
784                                 Errors |= SB_SmallRCVR;
785                                 RcvrEnDly = 0xae;
786                         }
787
788                         if(Pass == DQS_SECOND_PASS) { //second pass must average vales
789                                 RcvrEnDly += dqs_rcvr_dly_a[channel * 8 + receiver] /* - (RCVREN_MARGIN * T1000/64/50)*/;
790                                 RcvrEnDly >>= 1;
791                         }
792                 
793                         dqs_rcvr_dly_a[channel * 8 + receiver] = RcvrEnDly; 
794         
795                         //Set final RcvrEnDly for this DIMM and Channel 
796                         pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
797                 
798                         if(is_Width128) {
799                                 pci_write_config32_index_wait(ctrl->f2, 0x98, index+0x20, RcvrEnDly); // channel B
800                                 if(channel) { 
801                                         pci_write_config32_index_wait(ctrl->f2, 0x98, index, CurrRcvrCHADelay);
802                                         if(RcvrEnDly > CurrRcvrCHADelay) {
803                                                 dword = RcvrEnDly - CurrRcvrCHADelay;   
804                                         }
805                                         else {
806                                                 dword = CurrRcvrCHADelay - RcvrEnDly;
807                                         }
808                                         dword *= 50;
809                                         if(dword > T1000) {
810                                                 Errors |= SB_CHA2BRCVREN;
811                                         }
812                                 }
813                         }
814
815                         print_debug_dqs("\t\tTrainRcvEn63: RcvrEnDly ", RcvrEnDly, 2); 
816
817                         if(RcvrEnDly > CTLRMaxDelay) {
818                                 CTLRMaxDelay = RcvrEnDly;
819                         }
820
821                         print_debug_dqs("\t\tTrainRcvEn64: CTLRMaxDelay ", CTLRMaxDelay, 2); 
822                         
823                 } /* receiver */
824         } /* channel */
825
826         print_debug_dqs("\tTrainRcvEn65: CTLRMaxDelay ", CTLRMaxDelay, 1); 
827
828         /* Program the MaxAsysncLat field with the largest DQS Receiver Enable setting */
829         SetMaxAL_RcvrDly(ctrl, CTLRMaxDelay);
830         ResetDCTWrPtr(ctrl);
831
832         //Enable ECC again 
833         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
834         dword &= ~(DCL_DimmEccEn);
835         dword |= ecc_bit;
836         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
837
838         if(Pass == DQS_FIRST_PASS) {
839 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
840         if(!cpu_f0_f1) 
841 #endif
842         {
843                 dword = pci_read_config32(ctrl->f2, DRAM_CTRL);
844                 dword &= ~DC_DqsRcvEnTrain;
845                 pci_write_config32(ctrl->f2, DRAM_CTRL, dword);
846         }
847         }
848
849         //Clear wrap32dis 
850
851         clear_wrap32dis();
852
853         //restore SSE2 setting
854         disable_sse2();
855
856 #if MEM_TRAIN_SEQ != 1  
857         /* We need tidy output for type 1 */
858         #if CONFIG_USE_PRINTK_IN_CAR
859         printk_debug(" CTLRMaxDelay=%02x", CTLRMaxDelay);
860         #else
861         print_debug(" CTLRMaxDelay="); print_debug_hex8(CTLRMaxDelay); 
862         #endif
863 #endif
864
865         return (CTLRMaxDelay==0xae)?1:0;
866
867 }
868
/*
 * Direction selectors passed as the "direction"/"Direction" argument of the
 * DQS delay helpers below: 0 selects the write side, 1 the read side.
 */
#define DQS_WRITEDIR 0
#define DQS_READDIR 1
871
872
873 static void SetDQSDelayCSR(const struct mem_controller *ctrl, unsigned channel, unsigned bytelane, unsigned direction, unsigned dqs_delay)
874 { //ByteLane could be 0-8, last is for ECC
875         unsigned index;
876         uint32_t dword;
877         unsigned shift;
878
879         dqs_delay &= 0xff;
880
881         index = (bytelane>>2) + 1 + channel * 0x20 + (direction << 2);
882         shift = bytelane;
883         while(shift>3) {
884                 shift-=4;
885         }
886         shift <<= 3; // 8 bit
887
888         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
889         dword &= ~(0x3f<<shift);
890         dword |= (dqs_delay<<shift);
891         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
892
893 }
894
895 static void SetDQSDelayAllCSR(const struct mem_controller *ctrl, unsigned channel, unsigned direction, unsigned dqs_delay)
896 {
897         unsigned index;
898         uint32_t dword;
899         int i;
900         
901         dword = 0;
902         dqs_delay &= 0xff;
903         for(i=0;i<4;i++) { 
904                 dword |= dqs_delay<<(i*8);
905         }
906
907         index = 1 + channel * 0x20 + direction * 4;
908
909         for(i=0; i<2; i++) {
910                 pci_write_config32_index_wait(ctrl->f2, 0x98, index + i, dword);
911         }
912         
913 }
914
/*
 * Return the midpoint of the window [min_d, max_d], rounding up when the
 * window size (max_d - min_d) is odd.
 */
static unsigned MiddleDQS(unsigned min_d, unsigned max_d)
{
	const unsigned span = max_d - min_d;

	/* half the span, plus one more step when the span is odd */
	return min_d + (span >> 1) + (span & 1);
}
924
/*
 * Record a trained delay in the flat table dqs_delay_a, which is laid out as
 * [channel][direction][bytelane] with 2 directions per channel and 9 lanes
 * per direction.
 */
static inline void save_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, uint8_t *dqs_delay_a, uint8_t dqs_delay)
{
	const unsigned slot = (channel * 2 + direction) * 9 + bytelane;

	dqs_delay_a[slot] = dqs_delay;
}
929
/*
 * Write the DQS test pattern held in buf_a to addr_lo.
 * The line count handed to WriteLNTestPattern is 9 for pattern 0 and
 * 18 for pattern 1.
 */
static void WriteDQSTestPattern(unsigned addr_lo, unsigned pattern, uint8_t *buf_a)
{
	const unsigned line_count = (pattern + 1) * 9;

	WriteLNTestPattern(addr_lo, buf_a, line_count);
}
934
/*
 * Issue one 32-bit load from each of 18 consecutive 64-byte lines starting
 * at addr_lo. The loaded values are discarded.
 *
 * All accesses use the FS segment override, so the caller must have set the
 * FS base beforehand (TrainDQSPos does this with set_FSBASE()).
 *
 * Every load lands in %eax, so %eax is declared as a clobber rather than
 * being passed in as an input operand: GCC does not allow an asm statement
 * to modify an input-only operand.
 */
static void ReadL18TestPattern(unsigned addr_lo)
{
	//set fs and use fs prefix to access the mem
	__asm__ volatile (
		"movl %%fs:-128(%%esi), %%eax\n\t"	//TestAddr cache line
		"movl %%fs:-64(%%esi), %%eax\n\t"	//+1
		"movl %%fs:(%%esi), %%eax\n\t"		//+2
		"movl %%fs:64(%%esi), %%eax\n\t"	//+3

		"movl %%fs:-128(%%edi), %%eax\n\t"	//+4
		"movl %%fs:-64(%%edi), %%eax\n\t"	//+5
		"movl %%fs:(%%edi), %%eax\n\t"		//+6
		"movl %%fs:64(%%edi), %%eax\n\t"	//+7

		"movl %%fs:-128(%%ebx), %%eax\n\t"	//+8
		"movl %%fs:-64(%%ebx), %%eax\n\t"	//+9
		"movl %%fs:(%%ebx), %%eax\n\t"		//+10
		"movl %%fs:64(%%ebx), %%eax\n\t"	//+11

		"movl %%fs:-128(%%ecx), %%eax\n\t"	//+12
		"movl %%fs:-64(%%ecx), %%eax\n\t"	//+13
		"movl %%fs:(%%ecx), %%eax\n\t"		//+14
		"movl %%fs:64(%%ecx), %%eax\n\t"	//+15

		"movl %%fs:-128(%%edx), %%eax\n\t"	//+16
		"movl %%fs:-64(%%edx), %%eax\n\t"	//+17

		: /* no outputs */
		: "b" (addr_lo+128+8*64), "c" (addr_lo+128+12*64), "d" (addr_lo +128+16*64), "S"(addr_lo+128), "D"(addr_lo+128+4*64)
		: "eax"
	);

}
966
/*
 * Issue one 32-bit load from each of 9 consecutive 64-byte lines starting
 * at addr_lo. The loaded values are discarded.
 *
 * All accesses use the FS segment override, so the caller must have set the
 * FS base beforehand (TrainDQSPos does this with set_FSBASE()).
 *
 * Every load lands in %eax, so %eax is declared as a clobber rather than
 * being passed in as an input operand: GCC does not allow an asm statement
 * to modify an input-only operand.
 */
static void ReadL9TestPattern(unsigned addr_lo)
{

	//set fs and use fs prefix to access the mem
	__asm__ volatile (

		"movl %%fs:-128(%%ecx), %%eax\n\t"	//TestAddr cache line
		"movl %%fs:-64(%%ecx), %%eax\n\t"	//+1
		"movl %%fs:(%%ecx), %%eax\n\t"		//+2
		"movl %%fs:64(%%ecx), %%eax\n\t"	//+3

		"movl %%fs:-128(%%edx), %%eax\n\t"	//+4
		"movl %%fs:-64(%%edx), %%eax\n\t"	//+5
		"movl %%fs:(%%edx), %%eax\n\t"		//+6
		"movl %%fs:64(%%edx), %%eax\n\t"	//+7

		"movl %%fs:-128(%%ebx), %%eax\n\t"	//+8

		: /* no outputs */
		: "b" (addr_lo+128+8*64), "c"(addr_lo+128), "d"(addr_lo+128+4*64)
		: "eax"
	);

}
989
990
/*
 * Read back the test pattern at addr_lo: the 9-line reader for pattern 0,
 * the 18-line reader for any other pattern value.
 */
static void ReadDQSTestPattern(unsigned addr_lo, unsigned pattern)
{
	if (pattern != 0) {
		ReadL18TestPattern(addr_lo);
		return;
	}

	ReadL9TestPattern(addr_lo);
}
1000
/*
 * clflush the 9 consecutive 64-byte lines starting at addr_lo, addressed
 * through the FS segment override.
 *
 * Three base pointers are used, each placed 128 bytes into a group of four
 * lines so the -128/-64/0/+64 displacements reach the whole group.
 */
static void FlushDQSTestPattern_L9(unsigned addr_lo)
{
	const unsigned lines_0_3 = addr_lo + 128;
	const unsigned lines_4_7 = addr_lo + 128 + 4*64;
	const unsigned line_8 = addr_lo + 128 + 8*64;

	__asm__ volatile (
		"clflush %%fs:-128(%[g0])\n\t"
		"clflush %%fs:-64(%[g0])\n\t"
		"clflush %%fs:(%[g0])\n\t"
		"clflush %%fs:64(%[g0])\n\t"

		"clflush %%fs:-128(%[g1])\n\t"
		"clflush %%fs:-64(%[g1])\n\t"
		"clflush %%fs:(%[g1])\n\t"
		"clflush %%fs:64(%[g1])\n\t"

		"clflush %%fs:-128(%[g2])\n\t"

		: /* no outputs */
		: [g2] "b" (line_8), [g0] "c" (lines_0_3), [g1] "a" (lines_4_7)
	);
}
/*
 * clflush the 18 consecutive 64-byte lines starting at addr_lo, addressed
 * through the FS segment override.
 *
 * Five base pointers are used, each placed 128 bytes into a group of four
 * lines so the -128/-64/0/+64 displacements reach the whole group; the last
 * group only contains two lines.
 */
static __attribute__((noinline)) void FlushDQSTestPattern_L18(unsigned addr_lo)
{
	const unsigned lines_0_3 = addr_lo + 128;
	const unsigned lines_4_7 = addr_lo + 128 + 4*64;
	const unsigned lines_8_11 = addr_lo + 128 + 8*64;
	const unsigned lines_12_15 = addr_lo + 128 + 12*64;
	const unsigned lines_16_17 = addr_lo + 128 + 16*64;

	__asm__ volatile (
		"clflush %%fs:-128(%[g0])\n\t"
		"clflush %%fs:-64(%[g0])\n\t"
		"clflush %%fs:(%[g0])\n\t"
		"clflush %%fs:64(%[g0])\n\t"

		"clflush %%fs:-128(%[g1])\n\t"
		"clflush %%fs:-64(%[g1])\n\t"
		"clflush %%fs:(%[g1])\n\t"
		"clflush %%fs:64(%[g1])\n\t"

		"clflush %%fs:-128(%[g2])\n\t"
		"clflush %%fs:-64(%[g2])\n\t"
		"clflush %%fs:(%[g2])\n\t"
		"clflush %%fs:64(%[g2])\n\t"

		"clflush %%fs:-128(%[g3])\n\t"
		"clflush %%fs:-64(%[g3])\n\t"
		"clflush %%fs:(%[g3])\n\t"
		"clflush %%fs:64(%[g3])\n\t"

		"clflush %%fs:-128(%[g4])\n\t"
		"clflush %%fs:-64(%[g4])\n\t"

		: /* no outputs */
		: [g2] "b" (lines_8_11), [g3] "c" (lines_12_15), [g4] "d" (lines_16_17), [g0] "a" (lines_0_3), [g1] "D" (lines_4_7)
	);
}
1049
/*
 * Flush the test pattern lines at addr_lo from the cache: 9 lines for
 * pattern 0, 18 lines for any other pattern value.
 */
static void FlushDQSTestPattern(unsigned addr_lo, unsigned pattern )
{
	if (pattern != 0) {
		FlushDQSTestPattern_L18(addr_lo);
		return;
	}

	FlushDQSTestPattern_L9(addr_lo);
}
1060
/*
 * Compare memory at FS:addr_lo against the reference pattern in buf_a.
 *
 * 9*64/4 dwords are compared byte by byte. Consecutive bytes map to byte
 * lanes 0..7 in rotation. The returned bitmap starts as 0xff and bit N is
 * cleared as soon as any byte belonging to lane N differs.
 *
 * When pattern and channel are both non-zero the comparison starts 8 bytes
 * into both the memory region and the buffer. When pattern == 1, 8 bytes are
 * additionally skipped on both sides after every 8 compared bytes, stepping
 * over the other channel's data.
 */
static unsigned CompareDQSTestPattern(unsigned channel, unsigned addr_lo, unsigned pattern, uint8_t *buf_a)
{
	uint32_t *expected = (uint32_t *)buf_a;
	unsigned pass_map = 0xff;
	unsigned lane = 0;
	unsigned remaining;

	if (pattern && channel) {
		addr_lo += 8; //second channel
		expected += 2;
	}

	for (remaining = 9*64/4; remaining != 0; remaining--) {
		uint32_t readback;
		uint32_t reference;
		uint32_t diff;
		unsigned bit;

		__asm__ volatile (
			"movl %%fs:(%1), %0\n\t"
			:"=b"(readback): "a" (addr_lo)
		);
		reference = *expected;

		print_debug_dqs_pair("\t\t\t\t\t\ttest_buf= ", (unsigned)expected, " value = ", reference, 7);
		print_debug_dqs_pair("\t\t\t\t\t\ttaddr_lo = ",addr_lo, " value = ", readback, 7);

		/* a non-zero byte in the XOR marks a lane that read back wrong */
		diff = readback ^ reference;
		for (bit = 0; bit < 4*8; bit += 8) {
			if ((diff >> bit) & 0xff) {
				pass_map &= ~(1<<lane);
			}
			lane = (lane + 1) & 0x7;
		}
		print_debug_dqs("\t\t\t\t\t\tbitmap = ", pass_map, 7);

		if (lane == 0 && pattern == 1) { //dual channel
			addr_lo += 8; //skip over other channel's data
			expected += 2;
		}
		addr_lo += 4;
		expected += 1;
	}

	return pass_map;
}
1115
1116 static unsigned TrainDQSPos(const struct mem_controller *ctrl, unsigned channel, unsigned Direction, unsigned Pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1117 {
1118         unsigned ByteLane;
1119         unsigned Errors;
1120         unsigned BanksPresent;
1121
1122         unsigned MutualCSPassW[48];     
1123
1124         unsigned ChipSel;
1125         unsigned DQSDelay;
1126         
1127         unsigned TestAddr;
1128
1129         unsigned LastTest;
1130         unsigned RnkDlyFilterMax, RnkDlyFilterMin = 0;
1131         unsigned RnkDlySeqPassMax, RnkDlySeqPassMin = 0;
1132
1133         Errors = 0;
1134         BanksPresent = 0;
1135
1136         print_debug_dqs("\t\t\tTrainDQSPos begin ", 0, 3);
1137
1138         print_debug_addr("TrainDQSPos: MutualCSPassW[48] :", MutualCSPassW);
1139
1140         for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
1141                 MutualCSPassW[DQSDelay] = 0xff; // Bitmapped status per delay setting, 0xff=All positions passing (1= PASS)
1142         }
1143
1144         for(ChipSel = 0; ChipSel < 8; ChipSel++) { //logical register chipselects 0..7
1145                 print_debug_dqs("\t\t\t\tTrainDQSPos: 11 ChipSel ", ChipSel, 4); 
1146                 //FIXME: process 64MUXedMode
1147                 if(!ChipSelPresent(ctrl, ChipSel, sysinfo)) continue;
1148                 BanksPresent  = 1;
1149
1150                 TestAddr = Get_MCTSysAddr(ctrl, ChipSel, sysinfo);
1151
1152                 print_debug_dqs("\t\t\t\tTrainDQSPos: 12 TestAddr ", TestAddr, 4); 
1153
1154                 //set fs and use fs prefix to access the mem
1155                 set_FSBASE(TestAddr>>24);
1156
1157                 if(Direction == DQS_READDIR) {
1158                         print_debug_dqs("\t\t\t\tTrainDQSPos: 13 for read so write at first", 0, 4);
1159                         WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a);
1160                 }
1161
1162                 for(DQSDelay = 0; DQSDelay < 48; DQSDelay++ ){
1163                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 141 DQSDelay ", DQSDelay, 5); 
1164                         if(MutualCSPassW[DQSDelay] == 0) continue; //skip current delay value if other chipselects have failed all 8 bytelanes
1165                         SetDQSDelayAllCSR(ctrl, channel, Direction, DQSDelay);
1166                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 142 MutualCSPassW ", MutualCSPassW[DQSDelay], 5); 
1167                         if(Direction == DQS_WRITEDIR) {
1168                                 print_debug_dqs("\t\t\t\t\tTrainDQSPos: 143 for write", 0, 5);
1169                                 WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a); 
1170                         }
1171                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 144 Pattern ", Pattern, 5);
1172                         ReadDQSTestPattern(TestAddr<<8, Pattern); 
1173                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 145 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
1174                         MutualCSPassW[DQSDelay] &= CompareDQSTestPattern(channel, TestAddr<<8, Pattern, buf_a); //0: fail, 1=pass
1175                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 146 MutualCSPassW ", MutualCSPassW[DQSDelay], 5); 
1176                         SetTargetWTIO(TestAddr);
1177                         FlushDQSTestPattern(TestAddr<<8, Pattern); 
1178                         ResetTargetWTIO();
1179                 }
1180         }
1181
1182         if(BanksPresent) 
1183         for(ByteLane = 0; ByteLane < 8; ByteLane++) {
1184                 print_debug_dqs("\t\t\t\tTrainDQSPos: 31 ByteLane ",ByteLane, 4); 
1185
1186                 LastTest = DQS_FAIL;
1187                 RnkDlySeqPassMax = 0;
1188                 RnkDlyFilterMax = 0;
1189                 RnkDlyFilterMin = 0;
1190                 for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
1191                         if(MutualCSPassW[DQSDelay] & (1<<ByteLane)) {
1192
1193                                 print_debug_dqs("\t\t\t\t\tTrainDQSPos: 321 DQSDelay ", DQSDelay, 5); 
1194                                 print_debug_dqs("\t\t\t\t\tTrainDQSPos: 322 MutualCSPassW ", MutualCSPassW[DQSDelay], 5); 
1195
1196                                 RnkDlySeqPassMax = DQSDelay;
1197                                 if(LastTest == DQS_FAIL) {
1198                                         RnkDlySeqPassMin = DQSDelay; //start sequential run
1199                                 }
1200                                 if((RnkDlySeqPassMax - RnkDlySeqPassMin)>(RnkDlyFilterMax-RnkDlyFilterMin)){
1201                                         RnkDlyFilterMin = RnkDlySeqPassMin;
1202                                         RnkDlyFilterMax = RnkDlySeqPassMax;
1203                                 }
1204                                 LastTest = DQS_PASS;
1205                         }
1206                         else {
1207                                 LastTest = DQS_FAIL;
1208                         }
1209                 }
1210                 print_debug_dqs("\t\t\t\tTrainDQSPos: 33 RnkDlySeqPassMax ", RnkDlySeqPassMax, 4); 
1211
1212                 if(RnkDlySeqPassMax == 0) {
1213                         Errors |= SB_NODQSPOS; // no passing window
1214                 }
1215                 else {
1216                         print_debug_dqs("\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMax ", RnkDlyFilterMax, 4); 
1217                         print_debug_dqs("\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMin ", RnkDlyFilterMin, 4); 
1218                         if((RnkDlyFilterMax - RnkDlyFilterMin)< MIN_DQS_WNDW){
1219                                 Errors |= SB_SMALLDQS;
1220                         }
1221                         else {
1222                                 unsigned middle_dqs;
1223                                 middle_dqs = MiddleDQS(RnkDlyFilterMin, RnkDlyFilterMax); 
1224                                 print_debug_dqs("\t\t\t\tTrainDQSPos: 35 middle_dqs ",middle_dqs, 4); 
1225                                 SetDQSDelayCSR(ctrl, channel, ByteLane, Direction, middle_dqs);
1226                                 save_dqs_delay(channel, ByteLane, Direction, dqs_delay_a, middle_dqs);
1227                         }
1228                 }       
1229
1230         }
1231
1232         print_debug_dqs("\t\t\tTrainDQSPos: end", 0xff, 3);
1233         
1234         return Errors;
1235         
1236
1237 }
1238
1239 static unsigned TrainReadDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1240 {
1241         print_debug_dqs("\t\tTrainReadPos", 0, 2); 
1242         return TrainDQSPos(ctrl, channel, DQS_READDIR, pattern, buf_a, dqs_delay_a, sysinfo);   
1243 }
1244
1245 static unsigned TrainWriteDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1246 {
1247         print_debug_dqs("\t\tTrainWritePos", 0, 2);
1248         return TrainDQSPos(ctrl, channel, DQS_WRITEDIR, pattern, buf_a, dqs_delay_a, sysinfo);
1249 }
1250
1251
1252
1253 static unsigned TrainDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1254 {
1255         static const uint32_t TestPatternJD1a[] = {
1256                                         0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW0-1, ALL-EVEN
1257                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW2-3, ALL-EVEN
1258                                         0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW4-5, ALL-EVEN
1259                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW6-7, ALL-EVEN
1260                                         0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW0-1, DQ0-ODD
1261                                         0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW2-3, DQ0-ODD
1262                                         0x01010101,0x01010101,0xFeFeFeFe,0xFeFeFeFe, // QW4-5, DQ0-ODD
1263                                         0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW6-7, DQ0-ODD
1264                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW0-1, DQ1-ODD
1265                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2-3, DQ1-ODD
1266                                         0xFdFdFdFd,0xFdFdFdFd,0x02020202,0x02020202, // QW4-5, DQ1-ODD
1267                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW6-7, DQ1-ODD
1268                                         0x04040404,0x04040404,0xfBfBfBfB,0xfBfBfBfB, // QW0-1, DQ2-ODD
1269                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW2-3, DQ2-ODD
1270                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4-5, DQ2-ODD
1271                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6-7, DQ2-ODD
1272                                         0x08080808,0x08080808,0xF7F7F7F7,0xF7F7F7F7, // QW0-1, DQ3-ODD
1273                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW2-3, DQ3-ODD
1274                                         0xF7F7F7F7,0xF7F7F7F7,0x08080808,0x08080808, // QW4-5, DQ3-ODD
1275                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6-7, DQ3-ODD
1276                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW0-1, DQ4-ODD
1277                                         0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW2-3, DQ4-ODD
1278                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4-5, DQ4-ODD
1279                                         0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW6-7, DQ4-ODD
1280                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0-1, DQ5-ODD
1281                                         0xdFdFdFdF,0xdFdFdFdF,0x20202020,0x20202020, // QW2-3, DQ5-ODD
1282                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4-5, DQ5-ODD
1283                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6-7, DQ5-ODD
1284                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0-1, DQ6-ODD
1285                                         0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW2-3, DQ6-ODD
1286                                         0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW4-5, DQ6-ODD
1287                                         0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW6-7, DQ6-ODD
1288                                         0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW0-1, DQ7-ODD
1289                                         0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW2-3, DQ7-ODD
1290                                         0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW4-5, DQ7-ODD
1291                                         0x80808080,0x80808080,0x80808080,0x80808080  // QW6-7, DQ7-ODD
1292                 };
1293         static const uint32_t TestPatternJD1b[] = {
1294                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW0,CHA-B, ALL-EVEN
1295                                         0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW1,CHA-B, ALL-EVEN
1296                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW2,CHA-B, ALL-EVEN
1297                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW3,CHA-B, ALL-EVEN
1298                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW4,CHA-B, ALL-EVEN
1299                                         0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW5,CHA-B, ALL-EVEN
1300                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW6,CHA-B, ALL-EVEN
1301                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW7,CHA-B, ALL-EVEN
1302                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW0,CHA-B, DQ0-ODD
1303                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW1,CHA-B, DQ0-ODD
1304                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW2,CHA-B, DQ0-ODD
1305                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW3,CHA-B, DQ0-ODD
1306                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW4,CHA-B, DQ0-ODD
1307                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW5,CHA-B, DQ0-ODD
1308                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW6,CHA-B, DQ0-ODD
1309                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW7,CHA-B, DQ0-ODD
1310                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW0,CHA-B, DQ1-ODD
1311                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW1,CHA-B, DQ1-ODD
1312                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2,CHA-B, DQ1-ODD
1313                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW3,CHA-B, DQ1-ODD
1314                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW4,CHA-B, DQ1-ODD
1315                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW5,CHA-B, DQ1-ODD
1316                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW6,CHA-B, DQ1-ODD
1317                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW7,CHA-B, DQ1-ODD
1318                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW0,CHA-B, DQ2-ODD
1319                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW1,CHA-B, DQ2-ODD
1320                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW2,CHA-B, DQ2-ODD
1321                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW3,CHA-B, DQ2-ODD
1322                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4,CHA-B, DQ2-ODD
1323                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW5,CHA-B, DQ2-ODD
1324                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6,CHA-B, DQ2-ODD
1325                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW7,CHA-B, DQ2-ODD
1326                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW0,CHA-B, DQ3-ODD
1327                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW1,CHA-B, DQ3-ODD
1328                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW2,CHA-B, DQ3-ODD
1329                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW3,CHA-B, DQ3-ODD
1330                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW4,CHA-B, DQ3-ODD
1331                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW5,CHA-B, DQ3-ODD
1332                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6,CHA-B, DQ3-ODD
1333                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW7,CHA-B, DQ3-ODD
1334                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW0,CHA-B, DQ4-ODD
1335                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW1,CHA-B, DQ4-ODD
1336                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW2,CHA-B, DQ4-ODD
1337                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW3,CHA-B, DQ4-ODD
1338                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4,CHA-B, DQ4-ODD
1339                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW5,CHA-B, DQ4-ODD
1340                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW6,CHA-B, DQ4-ODD
1341                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW7,CHA-B, DQ4-ODD
1342                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0,CHA-B, DQ5-ODD
1343                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW1,CHA-B, DQ5-ODD
1344                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW2,CHA-B, DQ5-ODD
1345                                         0x20202020,0x20202020,0x20202020,0x20202020, // QW3,CHA-B, DQ5-ODD
1346                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4,CHA-B, DQ5-ODD
1347                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW5,CHA-B, DQ5-ODD
1348                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6,CHA-B, DQ5-ODD
1349                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW7,CHA-B, DQ5-ODD
1350                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0,CHA-B, DQ6-ODD
1351                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW1,CHA-B, DQ6-ODD
1352                                         0x40404040,0x40404040,0x40404040,0x40404040, // QW2,CHA-B, DQ6-ODD
1353                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW3,CHA-B, DQ6-ODD
1354                                         0x40404040,0x40404040,0x40404040,0x40404040, // QW4,CHA-B, DQ6-ODD
1355                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW5,CHA-B, DQ6-ODD
1356                                         0x40404040,0x40404040,0x40404040,0x40404040, // QW6,CHA-B, DQ6-ODD
1357                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW7,CHA-B, DQ6-ODD
1358                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW0,CHA-B, DQ7-ODD
1359                                         0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW1,CHA-B, DQ7-ODD
1360                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW2,CHA-B, DQ7-ODD
1361                                         0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW3,CHA-B, DQ7-ODD
1362                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW4,CHA-B, DQ7-ODD
1363                                         0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW5,CHA-B, DQ7-ODD
1364                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW6,CHA-B, DQ7-ODD
1365                                         0x80808080,0x80808080,0x80808080,0x80808080  // QW7,CHA-B, DQ7-ODD
1366                 };
1367         uint8_t pattern_buf_x[64 * 18 + 16]; // We need to two cache line So have more 16 bytes to keep 16 byte alignment */
1368         uint8_t *buf_a;
1369
1370         unsigned pattern;
1371         uint32_t dword;
1372         uint32_t ecc_bit;
1373         unsigned Errors;
1374         unsigned channel;
1375         int i;
1376         unsigned DQSWrDelay;
1377         unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
1378         uint8_t *dqs_delay_a = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9
1379
1380         //enable SSE2
1381         enable_sse2();
1382
1383         //wrap32dis
1384         set_wrap32dis();
1385
1386         //disable ECC temp
1387         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
1388         ecc_bit = dword & DCL_DimmEccEn;
1389         dword &= ~(DCL_DimmEccEn);
1390         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
1391
1392         //SetupDqsPattern
1393         buf_a = (uint8_t *)(((uint32_t)(&pattern_buf_x[0]) + 0x10) & (~0xf));
1394
1395         if(is_Width128){
1396                 pattern = 1;
1397                 for(i=0;i<16*18;i++) {
1398                         *((uint32_t *)(buf_a + i*4)) = TestPatternJD1b[i];
1399                  }
1400         }
1401         else {
1402                 pattern = 0;
1403                 for(i=0; i<16*9;i++) {
1404                         *((uint32_t *)(buf_a + i*4)) = TestPatternJD1a[i];
1405                 }
1406                 
1407         }
1408
1409         print_debug_dqs("\r\nTrainDQSRdWrPos: 0 ctrl ", ctrl->node_id, 0); 
1410
1411         print_debug_addr("TrainDQSRdWrPos: buf_a:", buf_a);
1412
1413         Errors = 0;
1414         channel = 0;
1415
1416         if (!(sysinfo->meminfo[ctrl->node_id].dimm_mask & 0x0F) &&
1417              (sysinfo->meminfo[ctrl->node_id].dimm_mask & 0xF0)) { /* channelB only? */
1418                 channel = 1;
1419         }
1420
1421         while( (channel<2) && (!Errors)) {
1422                 print_debug_dqs("\tTrainDQSRdWrPos: 1 channel ",channel, 1); 
1423                 for(DQSWrDelay = 0; DQSWrDelay < 48; DQSWrDelay++) {
1424                         unsigned err;
1425                         SetDQSDelayAllCSR(ctrl, channel, DQS_WRITEDIR, DQSWrDelay);
1426                         print_debug_dqs("\t\tTrainDQSRdWrPos: 21 DQSWrDelay ", DQSWrDelay, 2); 
1427                         err= TrainReadDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
1428                         print_debug_dqs("\t\tTrainDQSRdWrPos: 22 err ",err, 2); 
1429                         if(err == 0) break;
1430                         Errors |= err;
1431                 }
1432
1433                 print_debug_dqs("\tTrainDQSRdWrPos: 3 DQSWrDelay ", DQSWrDelay, 1); 
1434
1435                 if(DQSWrDelay < 48) {
1436                         Errors = TrainWriteDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
1437                         print_debug_dqs("\tTrainDQSRdWrPos: 4 Errors ", Errors, 1); 
1438
1439                 }
1440                 channel++;
1441                 if(!is_Width128){
1442                         //FIXME: 64MuxMode??    
1443                         channel++; // skip channel if 64-bit mode
1444                 }
1445         }
1446
1447         //Enable ECC again
1448         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
1449         dword &= ~(DCL_DimmEccEn);
1450         dword |= ecc_bit;
1451         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
1452
1453         //Clear wrap32dis
1454
1455         clear_wrap32dis();
1456
1457         //restore SSE2 setting
1458         disable_sse2();
1459
1460         print_debug_dqs("TrainDQSRdWrPos: ", 5, 0); 
1461         
1462         return Errors;
1463
1464 }
/*
 * Look up one trained DQS delay in a node's delay table.
 *
 * dqs_delay_a is indexed as channel * 18 + direction * 9 + bytelane,
 * i.e. 2 directions of 9 byte lanes for each channel.
 */
static inline uint8_t get_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, uint8_t *dqs_delay_a)
{
        return dqs_delay_a[channel * 2*9 + direction * 9 + bytelane];
}

/*
 * Interpolate a DQS delay between two byte lanes of the same channel and
 * direction.
 *
 * InterFactor: 0:    100% ByteLane 0
 *              0x80: 50% between ByteLane 0 and 1
 *              0xff: 99.6% ByteLane 1 and 0.4% like 0
 *
 * The result always starts from ByteLane0's delay and moves
 * InterFactor/256 of the distance towards ByteLane1's delay, so the outcome
 * is symmetric regardless of which lane has the larger delay.  (The previous
 * implementation anchored on the smaller delay and inverted the factor,
 * which lost one step to truncation: InterFactor 0 returned
 * DQSDelay0 - 1 whenever DQSDelay0 > DQSDelay1.)
 *
 * Returns the interpolated delay.
 */
static unsigned CalcEccDQSPos(unsigned channel,unsigned ByteLane0, unsigned ByteLane1, unsigned InterFactor, unsigned Direction, uint8_t *dqs_delay_a)
{
        unsigned DQSDelay0, DQSDelay1;
        unsigned DQSDelay;

        DQSDelay0 = get_dqs_delay(channel, ByteLane0, Direction, dqs_delay_a);
        DQSDelay1 = get_dqs_delay(channel, ByteLane1, Direction, dqs_delay_a);

        if(DQSDelay0>DQSDelay1) {
                /* Lane 1 is earlier: step down from lane 0 towards it. */
                DQSDelay = ((DQSDelay0 - DQSDelay1) * InterFactor) >> 8; // /256
                return DQSDelay0 - DQSDelay;
        }

        /* Lane 1 is later (or equal): step up from lane 0 towards it. */
        DQSDelay = ((DQSDelay1 - DQSDelay0) * InterFactor) >> 8; // /256
        return DQSDelay0 + DQSDelay;
}
1504
1505 static void SetEccDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1506 {       
1507         unsigned channel;
1508         unsigned ByteLane;
1509         unsigned Direction;
1510         unsigned lane0, lane1, ratio;
1511         unsigned dqs_delay;
1512
1513         unsigned direction[] = { DQS_READDIR, DQS_WRITEDIR };
1514         int i;
1515         uint8_t *dqs_delay_a = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9
1516
1517         ByteLane = 8;
1518
1519         for(channel = 0; channel < 2; channel++) {
1520                 for(i=0;i<2;i++) {
1521                         Direction = direction[i];
1522                         lane0 = 4; lane1 = 5; ratio = 0;
1523                         dqs_delay = CalcEccDQSPos(channel, lane0, lane1, ratio, Direction, dqs_delay_a);
1524                         print_debug_dqs_pair("\t\tSetEccDQSRdWrPos: channel ", channel, Direction==DQS_READDIR? " R dqs_delay":" W dqs_delay",  dqs_delay, 2); 
1525                         SetDQSDelayCSR(ctrl, channel, ByteLane, Direction, dqs_delay);
1526                         save_dqs_delay(channel, ByteLane, Direction, dqs_delay_a, dqs_delay);
1527                 }
1528         }
1529 }
1530
1531 static unsigned train_DqsRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
1532 {
1533         print_debug_dqs("\r\ntrain_DqsRcvrEn: begin ctrl ", ctrl->node_id, 0); 
1534         if(TrainRcvrEn(ctrl, Pass, sysinfo)) {
1535                 return 1;
1536         }
1537         print_debug_dqs("\r\ntrain_DqsRcvrEn: end ctrl ", ctrl->node_id, 0); 
1538         return 0;
1539         
1540 }
1541 static unsigned train_DqsPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1542 {
1543         print_debug_dqs("\r\ntrain_DqsPos: begin ctrl ", ctrl->node_id, 0); 
1544         if(TrainDQSRdWrPos(ctrl, sysinfo) != 0) {
1545                 print_err("\r\nDQS Training Rd Wr failed ctrl"); print_err_hex8(ctrl->node_id); print_err("\r\n");
1546                 return 1;
1547         }
1548         else {
1549                 SetEccDQSRdWrPos(ctrl, sysinfo);
1550         }
1551         print_debug_dqs("\r\ntrain_DqsPos: end ctrl ", ctrl->node_id, 0); 
1552         return 0;
1553         
1554 }
1555
#if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
/*
 * Workaround applied between receiver-enable pass 1 and DQS position
 * training for nodes where is_cpu_pre_f2_in_bsp() reports true.
 *
 * Phase 1, for every present controller that has memory and needs the
 * workaround: clear DC_DqsRcvEnTrain in DRAM_CTRL, pulse DI_EnDramInit in
 * DRAM_INIT (set, then clear), and compute a TSC deadline of
 * "now + tsc0[node]".
 * Phase 2: busy-wait on each such node until the TSC reaches its deadline.
 *
 * tsc0 holds one per-node 64-bit delay (hi/lo) supplied by the caller.
 * The local arrays have 8 entries, so controllers must not exceed 8.
 */
static void f0_svm_workaround(int controllers, const struct mem_controller *ctrl, tsc_t *tsc0, struct sys_info *sysinfo)
{
        tsc_t deadline[8];
        unsigned needs_fix[8];
        int node;

        print_debug_addr("dqs_timing: tsc1[8] :", deadline);

        for (node = 0; node < controllers; node++) {
                uint32_t reg;
                uint32_t lo_sum;

                /* Skip absent controllers and controllers without memory */
                if (!sysinfo->ctrl_present[node] || sysinfo->meminfo[node].dimm_mask == 0x00) {
                        continue;
                }

                needs_fix[node] = is_cpu_pre_f2_in_bsp(node);
                if (!needs_fix[node]) {
                        continue;
                }

                reg = pci_read_config32(ctrl[node].f2, DRAM_CTRL);
                reg &= ~DC_DqsRcvEnTrain;
                pci_write_config32(ctrl[node].f2, DRAM_CTRL, reg);

                /* Set and immediately clear EnDramInit */
                reg = pci_read_config32(ctrl[node].f2, DRAM_INIT);
                reg |= DI_EnDramInit;
                pci_write_config32(ctrl[node].f2, DRAM_INIT, reg);
                reg &= ~DI_EnDramInit;
                pci_write_config32(ctrl[node].f2, DRAM_INIT, reg);

                deadline[node] = rdtsc();
                print_debug_dqs_tsc("begin: tsc1", node, deadline[node].hi, deadline[node].lo, 2);

                /* 64-bit add of tsc0[node] done on 32-bit halves with manual carry */
                lo_sum = deadline[node].lo + tsc0[node].lo;
                if ((lo_sum < deadline[node].lo) || (lo_sum < tsc0[node].lo)) {
                        deadline[node].hi++;
                }
                deadline[node].lo = lo_sum;
                deadline[node].hi += tsc0[node].hi;

                print_debug_dqs_tsc("end  : tsc1", node, deadline[node].hi, deadline[node].lo, 2);
        }

        for (node = 0; node < controllers; node++) {
                tsc_t now;

                /* Same filter as above, so needs_fix[node] is only read where it was written */
                if (!sysinfo->ctrl_present[node] || sysinfo->meminfo[node].dimm_mask == 0x00) {
                        continue;
                }

                if (!needs_fix[node]) {
                        continue;
                }

                /* Spin until the TSC is no longer below the deadline */
                do {
                        now = rdtsc();
                } while ((deadline[node].hi > now.hi) || ((deadline[node].hi == now.hi) && (deadline[node].lo > now.lo)));

                print_debug_dqs_tsc("end  : tsc ", node, now.hi, now.lo, 2);
        }
}

#endif
1623
1624
1625 /* setting variable mtrr, comes from linux kernel source */
1626 static void set_var_mtrr_dqs(
1627         unsigned int reg, unsigned long basek, unsigned long sizek,
1628         unsigned char type, unsigned address_bits)
1629 {
1630         msr_t base, mask;
1631         unsigned address_mask_high;
1632
1633         address_mask_high = ((1u << (address_bits - 32u)) - 1u);
1634
1635         base.hi = basek >> 22;
1636         base.lo  = basek << 10;
1637
1638         if (sizek < 4*1024*1024) {
1639                 mask.hi = address_mask_high;
1640                 mask.lo = ~((sizek << 10) -1);
1641         }
1642         else {
1643                 mask.hi = address_mask_high & (~((sizek >> 22) -1));
1644                 mask.lo = 0;
1645         }
1646
1647         if (reg >= 8)
1648                 return;
1649
1650         if (sizek == 0) {
1651                 msr_t zero;
1652                 zero.lo = zero.hi = 0;
1653                 /* The invalid bit is kept in the mask, so we simply clear the
1654                    relevant mask register to disable a range. */
1655                 wrmsr (MTRRphysMask_MSR(reg), zero);
1656         } else {
1657                 /* Bit 32-35 of MTRRphysMask should be set to 1 */
1658                 base.lo |= type;
1659                 mask.lo |= 0x800;
1660                 wrmsr (MTRRphysBase_MSR(reg), base);
1661                 wrmsr (MTRRphysMask_MSR(reg), mask);
1662         }
1663 }
1664
1665
/*
 * fms: find most significant bit set.
 *
 * Returns the 0-based index of the highest set bit of x, or 0 when x is 0
 * (so fms(0) and fms(1) both yield 0).
 *
 * Implemented with the compiler builtin rather than hand-written "bsrl":
 * the former asm used a "g" input constraint, which also permits an
 * immediate operand that bsr cannot encode.
 */
static inline unsigned int fms(unsigned int x)
{
        if (x == 0)
                return 0;

        /* __builtin_clz() is undefined for 0, handled above */
        return 31u - (unsigned int)__builtin_clz(x);
}
1677
/*
 * fls: find least significant bit set.
 *
 * Returns the 0-based index of the lowest set bit of x, or 32 when x is 0.
 *
 * Implemented with the compiler builtin rather than hand-written "bsfl":
 * the former asm used a "g" input constraint, which also permits an
 * immediate operand that bsf cannot encode.
 */
static inline unsigned int fls(unsigned int x)
{
        if (x == 0)
                return 32;

        /* __builtin_ctz() is undefined for 0, handled above */
        return (unsigned int)__builtin_ctz(x);
}
1689
1690 static unsigned int range_to_mtrr(unsigned int reg,
1691         unsigned long range_startk, unsigned long range_sizek,
1692         unsigned long next_range_startk, unsigned char type, unsigned address_bits)
1693 {
1694         if (!range_sizek || (reg >= 8)) {
1695                 return reg;
1696         }
1697         while(range_sizek) {
1698                 unsigned long max_align, align;
1699                 unsigned long sizek;
1700                 /* Compute the maximum size I can make a range */
1701                 max_align = fls(range_startk);
1702                 align = fms(range_sizek);
1703                 if (align > max_align) {
1704                         align = max_align;
1705                 }
1706                 sizek = 1 << align;
1707 #if MEM_TRAIN_SEQ != 1
1708         #if CONFIG_USE_PRINTK_IN_CAR
1709                 printk_debug("Setting variable MTRR %d, base: %4dMB, range: %4dMB, type %s\r\n",
1710                         reg, range_startk >>10, sizek >> 10,
1711                         (type==MTRR_TYPE_UNCACHEABLE)?"UC":
1712                             ((type==MTRR_TYPE_WRBACK)?"WB":"Other")
1713                         );
1714         #else
1715                 print_debug("Setting variable MTRR "); print_debug_hex8(reg); print_debug(", base: "); print_debug_hex16(range_startk>>10); 
1716                         print_debug("MB, range: "); print_debug_hex16(sizek >> 10); print_debug("MB, type "); 
1717                         print_debug( (type==MTRR_TYPE_UNCACHEABLE)?"UC\r\n":
1718                                       ((type==MTRR_TYPE_WRBACK)?"WB\r\n":"Other\r\n")
1719                                    );
1720         #endif
1721 #endif
1722                 set_var_mtrr_dqs(reg++, range_startk, sizek, type, address_bits);
1723                 range_startk += sizek;
1724                 range_sizek -= sizek;
1725                 if (reg >= 8)
1726                         break;
1727         }
1728         return reg;
1729 }
1730
1731 static void set_top_mem_ap(unsigned tom_k, unsigned tom2_k)
1732 {
1733         msr_t msr;
1734
1735         /* Now set top of memory */
1736         msr.lo = (tom2_k & 0x003fffff) << 10;
1737         msr.hi = (tom2_k & 0xffc00000) >> 22;
1738         wrmsr(TOP_MEM2, msr);
1739
1740         msr.lo = (tom_k & 0x003fffff) << 10;
1741         msr.hi = (tom_k & 0xffc00000) >> 22;
1742         wrmsr(TOP_MEM, msr);
1743 }
1744
1745 static void setup_mtrr_dqs(unsigned tom_k, unsigned tom2_k){
1746         unsigned reg;
1747         msr_t msr;
1748
1749 #if 0
1750         //still enable from cache_as_ram.inc
1751         msr = rdmsr(SYSCFG_MSR);
1752         msr.lo |= SYSCFG_MSR_MtrrFixDramModEn;
1753         wrmsr(SYSCFG_MSR,msr);
1754 #endif
1755
1756         //[0,512k), [512k, 640k)
1757         msr.hi = 0x1e1e1e1e;
1758         msr.lo = msr.hi;
1759         wrmsr(0x250, msr);
1760         wrmsr(0x258, msr);
1761
1762         //[1M, TOM)
1763         reg = range_to_mtrr(2, 0, tom_k,4*1024*1024, MTRR_TYPE_WRBACK, 40);
1764
1765         //[4G, TOM2)
1766         if(tom2_k) {
1767                 //enable tom2 and type
1768                 msr = rdmsr(SYSCFG_MSR);
1769                 msr.lo |= (1<<21) | (1<<22); //MtrrTom2En and Tom2ForceMemTypeWB
1770                 wrmsr(SYSCFG_MSR, msr);
1771         }
1772
1773 }
1774
1775 static void clear_mtrr_dqs(unsigned tom2_k){
1776         msr_t msr;
1777         unsigned i;
1778
1779         //still enable from cache_as_ram.inc
1780         msr = rdmsr(SYSCFG_MSR);
1781         msr.lo |= SYSCFG_MSR_MtrrFixDramModEn;
1782         wrmsr(SYSCFG_MSR,msr);
1783
1784         //[0,512k), [512k, 640k)
1785         msr.hi = 0;
1786         msr.lo = msr.hi;
1787         wrmsr(0x250, msr);
1788         wrmsr(0x258, msr);
1789
1790         //[1M, TOM)
1791         for(i=0x204;i<0x210;i++) {
1792                 wrmsr(i, msr);
1793         }
1794
1795         //[4G, TOM2)
1796         if(tom2_k) {
1797                 //enable tom2 and type
1798                 msr = rdmsr(SYSCFG_MSR);
1799                 msr.lo &= ~((1<<21) | (1<<22)); //MtrrTom2En and Tom2ForceMemTypeWB
1800                 wrmsr(SYSCFG_MSR, msr);
1801         }
1802 }
1803
1804 static void set_htic_bit(unsigned i, unsigned val, unsigned bit)
1805 {
1806         uint32_t dword;
1807         dword = pci_read_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL);
1808         dword &= ~(1<<bit);
1809         dword |= ((val & 1) <<bit);
1810         pci_write_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL, dword);
1811 }
1812
1813
1814 static unsigned get_htic_bit(unsigned i, unsigned bit)
1815 {
1816         uint32_t dword;
1817         dword = pci_read_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL);
1818         dword &= (1<<bit);
1819         return dword;
1820 }
1821
/*
 * Busy-wait, without timeout, until bit 9 of node 0's HT_INIT_CONTROL
 * register reads as set (the flag written by set_sysinfo_in_ram()).
 */
static void wait_till_sysinfo_in_ram(void)
{
        while (!get_htic_bit(0, 9)) {
                /* spin */
        }
}
1828
/*
 * Set (val & 1) as the "sysinfo in RAM" flag, stored in bit 9 of node 0's
 * HT_INIT_CONTROL register and polled by wait_till_sysinfo_in_ram().
 */
static void set_sysinfo_in_ram(unsigned val)
{
        const unsigned node = 0;
        const unsigned flag_bit = 9;

        set_htic_bit(node, val, flag_bit);
}
1833
1834
1835 #if MEM_TRAIN_SEQ == 0
1836
1837
1838 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1839 static void dqs_timing(int controllers, const struct mem_controller *ctrl, tsc_t *tsc0, struct sys_info *sysinfo)
1840 #else
1841 static void dqs_timing(int controllers, const struct mem_controller *ctrl, struct sys_info *sysinfo)
1842 #endif
1843 {
1844         int  i;
1845
1846         tsc_t tsc[5];
1847
1848         //need to enable mtrr, so dqs training could access the test address
1849         setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
1850
1851         for(i = 0; i < controllers; i++) {
1852                 if (!sysinfo->ctrl_present[ i ])
1853                         continue;
1854
1855                 /* Skip everything if I don't have any memory on this controller */
1856                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1857
1858                 fill_mem_cs_sysinfo(i, ctrl+i, sysinfo);
1859         }
1860
1861         tsc[0] = rdtsc();
1862         for(i = 0; i < controllers; i++) {
1863                 if (!sysinfo->ctrl_present[ i ])
1864                         continue;
1865
1866                 /* Skip everything if I don't have any memory on this controller */
1867                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1868
1869                 print_debug("DQS Training:RcvrEn:Pass1: ");
1870                 print_debug_hex8(i);
1871                 if(train_DqsRcvrEn(ctrl+i, 1, sysinfo)) goto out;
1872                 print_debug(" done\r\n");
1873         }
1874
1875         tsc[1] = rdtsc();
1876 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1877         f0_svm_workaround(controllers, ctrl, tsc0, sysinfo);
1878 #endif
1879
1880         tsc[2] = rdtsc();
1881         for(i = 0; i < controllers; i++) {
1882                 if (!sysinfo->ctrl_present[i])
1883                         continue;
1884
1885                 /* Skip everything if I don't have any memory on this controller */
1886                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1887
1888                 print_debug("DQS Training:DQSPos: ");
1889                 print_debug_hex8(i);
1890                 if(train_DqsPos(ctrl+i, sysinfo)) goto out;
1891                 print_debug(" done\r\n");
1892         }
1893
1894         tsc[3] = rdtsc();
1895         for(i = 0; i < controllers; i++) {
1896                 if (!sysinfo->ctrl_present[i])
1897                         continue;
1898
1899                 /* Skip everything if I don't have any memory on this controller */
1900                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1901
1902                 print_debug("DQS Training:RcvrEn:Pass2: ");
1903                 print_debug_hex8(i);
1904                 if(train_DqsRcvrEn(ctrl+i, 2, sysinfo)) goto out;
1905                 print_debug(" done\r\n");
1906                 sysinfo->mem_trained[i]=1;
1907         }
1908
1909 out:
1910         tsc[4] = rdtsc();
1911         clear_mtrr_dqs(sysinfo->tom2_k);
1912
1913
1914         for(i=0;i<5;i++) {
1915                 print_debug_dqs_tsc_x("DQS Training:tsc", i,  tsc[i].hi, tsc[i].lo);
1916         }
1917
1918
1919         
1920 }
1921
1922 #endif
1923
1924
#if MEM_TRAIN_SEQ > 0

/*
 * Run the DQS training sequence for a single controller.
 *
 * i:       node index, used for sysinfo->mem_trained[] and for output.
 * ctrl:    the controller of that node.
 * v:       non-zero enables progress output and TSC sampling.
 *
 * Nothing is done unless sysinfo->mem_trained[i] is 0x80.  On return the
 * entry is 1 on success, or identifies the failing stage:
 *   0x81 receiver-enable pass 1, 0x82 DQS position, 0x83 receiver-enable
 *   pass 2.
 * With MEM_TRAIN_SEQ == 1 the MTRRs are set up here for the duration of the
 * training and cleared again on every exit path past the initial check.
 */
static void dqs_timing(int i, const struct mem_controller *ctrl, struct sys_info *sysinfo, unsigned v)
{

        int ii;

        tsc_t tsc[4];

        if(sysinfo->mem_trained[i] != 0x80) return;

        /* A failing stage jumps to "out" before the later samples are taken;
           zero them so the summary below never prints indeterminate values. */
        for(ii=0;ii<4;ii++) {
                tsc[ii].lo = 0;
                tsc[ii].hi = 0;
        }

#if MEM_TRAIN_SEQ == 1
        //need to enable mtrr, so dqs training could access the test address
        setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
#endif

        fill_mem_cs_sysinfo(i, ctrl, sysinfo);

        if(v) {
                tsc[0] = rdtsc();

                print_debug("set DQS timing:RcvrEn:Pass1: ");
                print_debug_hex8(i);
        }
        if(train_DqsRcvrEn(ctrl, 1,  sysinfo)) {
                sysinfo->mem_trained[i]=0x81; // failed in receiver-enable pass 1
                goto out;
        }

        if(v) {
                print_debug(" done\r\n");
                tsc[1] = rdtsc();
                print_debug("set DQS timing:DQSPos: ");
                print_debug_hex8(i);
        }

        if(train_DqsPos(ctrl, sysinfo)) {
                sysinfo->mem_trained[i]=0x82; // failed in DQS position training
                goto out;
        }

        if(v) {
                print_debug(" done\r\n");
                tsc[2] = rdtsc();

                print_debug("set DQS timing:RcvrEn:Pass2: ");
                print_debug_hex8(i);
        }
        if(train_DqsRcvrEn(ctrl, 2,  sysinfo)){
                sysinfo->mem_trained[i]=0x83; // failed in receiver-enable pass 2
                goto out;
        }

        if(v) {
                print_debug(" done\r\n");

                tsc[3] = rdtsc();
        }

out:
#if MEM_TRAIN_SEQ == 1
        clear_mtrr_dqs(sysinfo->tom2_k);
#endif

        if(v) {
                for(ii=0;ii<4;ii++) {
                      print_debug_dqs_tsc_x("Total DQS Training : tsc ", ii,  tsc[ii].hi, tsc[ii].lo);
                }
        }

        /* Still 0x80 means no stage recorded a failure: mark as trained */
        if(sysinfo->mem_trained[i] == 0x80) {
                sysinfo->mem_trained[i]=1;
        }

}
#endif
2001
2002 #if MEM_TRAIN_SEQ == 1
2003 static void train_ram(unsigned nodeid, struct sys_info *sysinfo, struct sys_info *sysinfox)
2004 {
2005         dqs_timing(nodeid, &sysinfo->ctrl[nodeid], sysinfo, 0); // keep the output tidy
2006 //      memcpy(&sysinfox->dqs_rcvr_dly_a[nodeid * 2 * 8],&sysinfo->dqs_rcvr_dly_a[nodeid * 2 * 8], 2*8);
2007 //      memcpy(&sysinfox->dqs_delay_a[nodeid * 2 * 2 * 9], &sysinfo->dqs_delay_a[nodeid * 2 * 2 * 9], 2 * 2 * 9);
2008         sysinfox->mem_trained[nodeid] = sysinfo->mem_trained[nodeid];
2009
2010 }
2011 static void copy_and_run_ap_code_in_car(unsigned ret_addr);
2012 static inline void train_ram_on_node(unsigned nodeid, unsigned coreid, struct sys_info *sysinfo, unsigned retcall)
2013 {
2014         if(coreid) return; // only do it on core0
2015         struct sys_info *sysinfox = ((CONFIG_LB_MEM_TOPK<<10) - DCACHE_RAM_GLOBAL_VAR_SIZE);
2016         wait_till_sysinfo_in_ram(); // use pci to get it
2017
2018         if(sysinfox->mem_trained[nodeid] == 0x80) {
2019         #if 0
2020                 sysinfo->tom_k = sysinfox->tom_k;
2021                 sysinfo->tom2_k = sysinfox->tom2_k;
2022                 sysinfo->meminfo[nodeid].is_Width128 = sysinfox->meminfo[nodeid].is_Width128;
2023                 sysinfo->mem_trained[nodeid] = sysinfox->mem_trained[nodeid];
2024                 memcpy(&sysinfo->ctrl[nodeid], &sysinfox->ctrl[nodeid], sizeof(struct mem_controller));
2025         #else
2026                 memcpy(sysinfo, sysinfox, DCACHE_RAM_GLOBAL_VAR_SIZE);
2027         #endif
2028                 set_top_mem_ap(sysinfo->tom_k, sysinfo->tom2_k); // keep the ap's tom consistent with bsp's
2029         #if CONFIG_AP_CODE_IN_CAR == 0
2030                 print_debug("CODE IN ROM AND RUN ON NODE:"); print_debug_hex8(nodeid); print_debug("\r\n");
2031                 train_ram(nodeid, sysinfo, sysinfox);
2032         #else
2033                 /* Can copy dqs_timing to ap cache and run from cache?
2034                 * we need coreboot_ap_car.rom? and treat it as coreboot_ram.rom for ap ?
2035                 */
2036                 copy_and_run_ap_code_in_car(retcall);
2037                 // will go back by jump
2038         #endif
2039         }
2040 }
2041 #endif