Fix CPUID typo. The typo caused the FID-to-memory-speed calculations to be off.
[coreboot.git] / src / northbridge / amd / amdk8 / raminit_f_dqs.c
1 /*
2  * This file is part of the coreboot project.
3  *
4  * Copyright (C) 2005 YingHai Lu
5  * Copyright (C) 2008 Advanced Micro Devices, Inc.
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; version 2 of the License.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
19 */
20
21 //0: mean no debug info
22 #define DQS_TRAIN_DEBUG 0
23
24 #if CONFIG_USE_PRINTK_IN_CAR
25 #else
26 #error This file needs CONFIG_USE_PRINTK_IN_CAR
27 #endif
28
/*
 * Trace helper: print `str` followed by `val` in hex, but only when
 * DQS_TRAIN_DEBUG is greater than `level`.  The body is empty when
 * DQS_TRAIN_DEBUG is 0.
 */
static inline void print_debug_dqs(const char *str, unsigned val, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	if (level >= DQS_TRAIN_DEBUG)
		return;

	printk_debug("%s%x\r\n", str, val);
#endif
}
37
/*
 * Trace helper: print two labelled 8-digit hex values on one line
 * (`str` + `val`, then `str2` + `val2`), but only when DQS_TRAIN_DEBUG
 * is greater than `level`.  The body is empty when DQS_TRAIN_DEBUG is 0.
 */
static inline void print_debug_dqs_pair(const char *str, unsigned val, const char *str2, unsigned val2, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	if (level >= DQS_TRAIN_DEBUG)
		return;

	printk_debug("%s%08x%s%08x\r\n", str, val, str2, val2);
#endif
}
46
/*
 * Trace helper: print "str[i]=" followed by `val` and `val2` as two
 * back-to-back 8-digit hex words, but only when DQS_TRAIN_DEBUG is
 * greater than `level`.  The body is empty when DQS_TRAIN_DEBUG is 0.
 */
static inline void print_debug_dqs_tsc(const char *str, unsigned i, unsigned val, unsigned val2, unsigned level)
{
#if DQS_TRAIN_DEBUG > 0
	if (level >= DQS_TRAIN_DEBUG)
		return;

	printk_debug("%s[%02x]=%08x%08x\r\n", str, i, val, val2);
#endif
}
55
/*
 * Unconditional variant of print_debug_dqs_tsc(): always prints
 * "str[i]=" followed by `val` and `val2` as two 8-digit hex words,
 * regardless of DQS_TRAIN_DEBUG.
 */
static inline void print_debug_dqs_tsc_x(const char *str, unsigned i, unsigned val, unsigned val2)
{
	printk_debug("%s[%02x]=%08x%08x\r\n", str, i, val, val2);
}
61
62 static void fill_mem_cs_sysinfo(unsigned nodeid, const struct mem_controller *ctrl, struct sys_info *sysinfo)
63 {
64
65         int i;
66         sysinfo->mem_base[nodeid] = pci_read_config32(ctrl->f1, 0x40 + (nodeid<<3));
67
68         for(i=0;i<8; i++) {
69                 sysinfo->cs_base[nodeid*8+i] = pci_read_config32(ctrl->f2, 0x40 + (i<<2));
70         }
71
72         sysinfo->hole_reg[nodeid] = pci_read_config32(ctrl->f1, 0xf0);
73
74 }
75 static unsigned Get_MCTSysAddr(const struct mem_controller *ctrl,  unsigned cs_idx, struct sys_info *sysinfo)
76 {
77         uint32_t dword;
78         uint32_t mem_base;
79         unsigned nodeid = ctrl->node_id;
80
81 #if HW_MEM_HOLE_SIZEK != 0
82         uint32_t hole_reg;
83 #endif
84
85         //get the local base addr of the chipselect
86         dword = sysinfo->cs_base[nodeid * 8 + cs_idx];
87         dword &= 0xfffffff0;
88
89         //sys addr= node base + local cs base
90         mem_base = sysinfo->mem_base[nodeid];
91         mem_base &= 0xffff0000;
92
93         dword += mem_base;
94 #if HW_MEM_HOLE_SIZEK != 0
95         hole_reg = sysinfo->hole_reg[nodeid];
96         if(hole_reg & 1) {
97                 unsigned hole_startk;
98                 hole_startk = (hole_reg & (0xff<<24)) >> 10;
99                 if( (dword >= (hole_startk<<2)) && (dword < ((4*1024*1024)<<2))) {
100                         dword += ((4*1024*1024 - hole_startk)<<2);
101                 }
102         }
103 #endif
104
105         //add 1MB offset to avoid compat area
106         dword += (1<<(20-8));
107
108         //So final result is upper 32 bit addr
109
110         return dword;
111
112 }
113
/*
 * System address used for receiver-enable training of chip select
 * `cs_idx`.  `channel` is accepted for symmetry with the other training
 * helpers but does not influence the result.
 */
static unsigned Get_RcvrSysAddr(const struct mem_controller * ctrl, unsigned channel, unsigned cs_idx, struct sys_info *sysinfo)
{
	unsigned sys_addr;

	sys_addr = Get_MCTSysAddr(ctrl, cs_idx, sysinfo);

	return sys_addr;
}
119
/* Return the current contents of control register CR4. */
static inline unsigned long read_cr4(void)
{
	unsigned long value;

	__asm__ __volatile__ ("movl %%cr4, %0" : "=r" (value));

	return value;
}
126
/* Load control register CR4 with `cr4`. */
static inline void write_cr4(unsigned long cr4)
{
	__asm__ __volatile__ (
		"movl %0, %%cr4"
		: /* no outputs */
		: "r" (cr4)
	);
}
131
132
/*
 * Set bit 9 of CR4 so that the SSE2 instructions used by the test
 * pattern writer (movdqa/movntdq) can be executed.
 *
 * Declared with a (void) prototype: an empty parameter list leaves the
 * parameters unspecified in C.
 */
static inline void enable_sse2(void)
{
	unsigned long cr4;

	cr4 = read_cr4();
	cr4 |= (1UL << 9);
	write_cr4(cr4);
}
140
/*
 * Clear bit 9 of CR4 again, undoing enable_sse2().  The bit is cleared
 * unconditionally; the value it had before enable_sse2() is not
 * remembered.
 *
 * Declared with a (void) prototype: an empty parameter list leaves the
 * parameters unspecified in C.
 */
static inline void disable_sse2(void)
{
	unsigned long cr4;

	cr4 = read_cr4();
	cr4 &= ~(1UL << 9);
	write_cr4(cr4);
}
148
149
150 static void set_wrap32dis(void) {
151         msr_t msr;
152
153         msr = rdmsr(0xc0010015);
154         msr.lo |= (1<<17);
155
156         wrmsr(0xc0010015, msr);
157
158 }
159
160 static void clear_wrap32dis(void) {
161         msr_t msr;
162
163         msr = rdmsr(0xc0010015);
164         msr.lo &= ~(1<<17);
165
166         wrmsr(0xc0010015, msr);
167
168 }
169
170 static void set_FSBASE(uint32_t addr_hi)
171 {
172         msr_t msr;
173
174         //set fs and use fs prefix to access the mem
175         msr.hi = addr_hi;
176         msr.lo = 0;
177         wrmsr(0xc0000100, msr); //FS_BASE
178
179 }
180
181 static unsigned ChipSelPresent(const struct mem_controller *ctrl, unsigned cs_idx, struct sys_info *sysinfo)
182 {
183         unsigned enabled;
184         unsigned nodeid = ctrl->node_id;
185
186
187         enabled = sysinfo->cs_base[nodeid * 8 + cs_idx];
188         enabled &= 1;
189
190         return enabled;
191
192 }
193
/*
 * Tell whether the rank behind chip select `cs_idx` takes part in
 * receiver-enable training.  Only chip-select presence is checked;
 * `channel` and `is_Width128` are not consulted.
 */
static unsigned RcvrRankEnabled(const struct mem_controller *ctrl, int channel, int cs_idx, unsigned is_Width128, struct sys_info *sysinfo)
{
	unsigned present;

	present = ChipSelPresent(ctrl, cs_idx, sysinfo);

	return present;
}
198
/*
 * Copy `line_num` 64-byte lines from `buf_a` to fs:[addr_lo] using
 * 16-byte non-temporal SSE2 stores (four stores per line).
 *
 * addr_lo  - 32-bit offset relative to the FS base (see set_FSBASE()).
 * buf_a    - source pattern; movdqa requires it to be 16-byte aligned.
 * line_num - number of 64-byte lines to write; 0 writes nothing.
 *
 * The asm advances eax/ebx and counts ecx down, so those operands are
 * declared read-write ("+") instead of input-only, and xmm0, memory and
 * flags are listed as clobbered.  Modifying an input-only operand is not
 * permitted in GCC extended asm.
 */
static void WriteLNTestPattern(unsigned addr_lo, uint8_t *buf_a, unsigned line_num)
{
	unsigned chunks = line_num * 4;	/* 16-byte chunks left to copy */

	/* `loop` decrements first: a zero count would wrap to 2^32 iterations. */
	if (chunks == 0)
		return;

	__asm__ volatile (
		"1:\n\t"
		"movdqa (%1), %%xmm0\n\t"
		"movntdq %%xmm0, %%fs:(%0)\n\t" /* xmm0 is 128 bit */
		"addl %3, %0\n\t"
		"addl %3, %1\n\t"
		"loop 1b\n\t"

		: "+a" (addr_lo), "+b" (buf_a), "+c" (chunks)
		: "d" (16)
		: "xmm0", "memory", "cc"
	);
}
214
/*
 * Write one 64-byte test line to system address `addr` (given in
 * 256-byte units).  Pattern selector `p` == 1 writes from `buf_b`,
 * any other value writes from `buf_a`.
 */
static void Write1LTestPattern(unsigned addr, unsigned p, uint8_t *buf_a, uint8_t *buf_b)
{
	uint8_t *src = (p == 1) ? buf_b : buf_a;

	/* High part goes into the FS base, low part is the offset. */
	set_FSBASE(addr >> 24);
	WriteLNTestPattern(addr << 8, src, 1);
}
225
/*
 * Touch system address `addr` (given in 256-byte units) with a single
 * 32-bit load through the FS segment.  The loaded value is discarded;
 * the load is only there to pull the line into the cache.
 */
static void Read1LTestPattern(unsigned addr)
{
	unsigned discarded;
	const unsigned offset = addr << 8;

	set_FSBASE(addr >> 24);

	/* 1st move causes read fill (to exclusive or shared) */
	__asm__ volatile (
		"movl %%fs:(%1), %0\n\t"
		: "=b" (discarded)
		: "a" (offset)
	);
}
239
/* Result of one pattern comparison (returned by CompareTestPatternQW0). */
#define DQS_FAIL 1
#define DQS_PASS 0

/* Training pass selector, passed to the training routines as `Pass`. */
#define DQS_SECOND_PASS 2
#define DQS_FIRST_PASS 1

/* Status codes OR-ed into the `Errors` accumulator during training. */
#define SB_NORCVREN 11		/* no passing receiver-enable window */
#define SB_CHA2BRCVREN 12	/* channel A and B delays differ by more than T1000 */
#define SB_SmallRCVR 13		/* passing window too narrow, too far delayed */
#define SB_NODQSPOS  14		/* not used in the receiver-enable code; presumably DQS position training */
#define SB_SMALLDQS 15		/* not used in the receiver-enable code; presumably DQS position training */

/* Tuning constants. */
#define RCVREN_MARGIN 6		/* only appears in commented-out expressions in TrainRcvrEn */
#define MIN_DQS_WNDW 3		/* not used in the receiver-enable code */
253
254
255 static unsigned CompareTestPatternQW0(unsigned channel, unsigned addr, unsigned pattern, const uint32_t *TestPattern0, const uint32_t *TestPattern1, const uint32_t *TestPattern2, unsigned Pass, unsigned is_Width128)
256 {
257         uint32_t addr_lo;
258         uint32_t *test_buf;
259         uint32_t value;
260         uint32_t value_test;
261         unsigned result = DQS_FAIL;
262
263         if(Pass == DQS_FIRST_PASS) {
264                 if(pattern==1) {
265                         test_buf = (uint32_t *)TestPattern1;
266                 }
267                 else {
268                         test_buf = (uint32_t *)TestPattern0;
269                 }
270         }
271         else {
272                 test_buf = (uint32_t *)TestPattern2;
273         }
274
275         set_FSBASE(addr>>24);
276
277         addr_lo = addr<<8;
278
279         if(is_Width128 && (channel == 1)) {
280                 addr_lo += 8; //second channel
281                 test_buf += 2;
282         }
283
284         __asm__ volatile (
285                 "movl %%fs:(%1), %0\n\t"
286                 :"=b"(value): "a" (addr_lo)
287         );
288
289         value_test = *test_buf;
290
291
292         print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : test_buf= ", (unsigned)test_buf, " value = ", value_test, 4);
293         print_debug_dqs_pair("\t\t\t\t\t\tQW0.lo : addr_lo = ", addr_lo, " value = ", value, 4);
294
295         if(value == value_test) {
296                 addr_lo += 4;
297                 test_buf++;
298                 __asm__ volatile (
299                         "movl %%fs:(%1), %0\n\t"
300                         :"=b"(value): "a" (addr_lo)
301                 );
302                 value_test = *test_buf;
303                 print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : test_buf= ", (unsigned)test_buf, " value = ", value_test, 4);
304                 print_debug_dqs_pair("\t\t\t\t\t\tQW0.hi : addr_lo = ", addr_lo, " value = ", value, 4);
305
306                 if(value == value_test){
307                         result =  DQS_PASS;
308                 }
309         }
310
311         if(Pass == DQS_SECOND_PASS) { // second pass need to be inverted
312                 if(result==DQS_PASS) {
313                         result = DQS_FAIL;
314                 }
315                 else {
316                         result = DQS_PASS;
317                 }
318         }
319
320         return result;
321
322 }
323
324 static void SetMaxAL_RcvrDly(const struct mem_controller *ctrl, unsigned dly)
325 {
326         uint32_t reg;
327
328         dly += (20-1); // round it
329         dly /= 20; // convert from unit 50ps to 1ns
330
331         dly += 6;
332
333
334         reg = pci_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
335         reg &= ~(DCH_MaxAsyncLat_MASK <<DCH_MaxAsyncLat_SHIFT);
336         reg |= ((dly - DCH_MaxAsyncLat_BASE) << DCH_MaxAsyncLat_SHIFT);
337         pci_write_config32(ctrl->f2, DRAM_CONFIG_HIGH, reg);
338
339 }
340
341 /*
342         Set the Target range to WT IO (using an IORR overlapping the already existing
343         WB dram type). Use IORR0
344 */
345 static void SetTargetWTIO(unsigned addr)
346 {
347         msr_t msr;
348         msr.hi = addr>>24;
349         msr.lo = addr<<8;
350         wrmsr(0xc0010016, msr); //IORR0 BASE
351
352         msr.hi = 0xff;
353         msr.lo = 0xfc000800;  // 64MB Mask
354         wrmsr(0xc0010017, msr); // IORR0 Mask
355 }
356
357 static void ResetTargetWTIO(void)
358 {
359         msr_t msr;
360
361         msr.hi = 0;
362         msr.lo = 0;
363         wrmsr(0xc0010017, msr); // IORR0 Mask
364 }
365
/*
 * Flush the cache line that holds system address `addr` (256-byte
 * units): the high part is loaded into the FS base and the line at
 * fs:[addr << 8] is flushed with clflush.
 */
static void proc_CLFLUSH(unsigned addr)
{
	const unsigned offset = addr << 8;

	set_FSBASE(addr >> 24);

	__asm__ volatile (
		/* clflush fs:[eax] */
		"clflush %%fs:(%0)\n\t"
		: /* no outputs */
		: "a" (offset)
	);
}
/*
 * Flush the cache line at system address `addr` while the range is
 * temporarily covered by IORR0: set up the IORR, flush, then clear the
 * IORR mask again.
 */
static void proc_IOCLFLUSH(unsigned addr)
{
	SetTargetWTIO(addr);	/* cover the target with IORR0 */
	proc_CLFLUSH(addr);	/* flush the line */
	ResetTargetWTIO();	/* drop the IORR0 mask */
}
385
386 static void ResetDCTWrPtr(const struct mem_controller *ctrl)
387 {
388         uint32_t dword;
389         unsigned index = 0x10;
390
391         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
392         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
393
394         index += 0x20;
395         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
396         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
397
398 }
399
400
401 static uint16_t get_exact_T1000(unsigned i)
402 {
403         //                                 200   266,   333,  400
404         static const uint16_t T1000_a[]= { 5000, 3759, 3003, 2500 };
405
406         static const uint16_t TT_a[] = {
407                  /*200   266   333   400 */
408          /*4 */   6250, 6250, 6250, 6250,
409          /*5 */   5000, 5000, 5000, 2500,
410          /*6 */   5000, 4166, 4166, 2500,
411          /*7 */   5000, 4285, 3571, 2500,
412
413          /*8 */   5000, 3750, 3125, 2500,
414          /*9 */   5000, 3888, 3333, 2500,
415          /*10*/   5000, 4000, 3000, 2500,
416          /*11*/   5000, 4090, 3181, 2500,
417
418          /*12*/   5000, 3750, 3333, 2500,
419          /*13*/   5000, 3846, 3076, 2500,
420          /*14*/   5000, 3928, 3214, 2500,
421          /*15*/   5000, 4000, 3000, 2500,
422         };
423
424         int index;
425         msr_t msr;
426
427         /* Check for FID control support */
428         struct cpuid_result cpuid1;
429         cpuid1 = cpuid(0x80000007);
430         if( cpuid1.edx & 0x02 ) {
431                 /* Use current FID */
432                 unsigned fid_cur;
433                 msr = rdmsr(0xc0010042);
434                 fid_cur = msr.lo & 0x3f;
435
436                 index = fid_cur>>1;
437         } else {
438                 /* Use startup FID */
439                 unsigned fid_start;
440                 msr = rdmsr(0xc0010015);
441                 fid_start = (msr.lo & (0x3f << 24));
442                 
443                 index = fid_start>>25;
444         }
445
446         if(index>12) return T1000_a[i];
447
448         return TT_a[index * 4+i];
449
450 }
451
452 static void InitDQSPos4RcvrEn(const struct mem_controller *ctrl)
453 {
454         int i;
455         uint32_t dword;
456
457         dword = 0x00000000;
458         for(i=1; i<=3; i++) {
459                 /* Program the DQS Write Timing Control Registers (Function 2:Offset 0x9c, index 0x01-0x03, 0x21-0x23) to 0x00 for all bytes */
460                 pci_write_config32_index_wait(ctrl->f2, 0x98, i, dword);
461                 pci_write_config32_index_wait(ctrl->f2, 0x98, i+0x20, dword);
462         }
463
464         dword = 0x2f2f2f2f;
465         for(i=5; i<=7; i++) {
466                 /* Program the DQS Write Timing Control Registers (Function 2:Offset 0x9c, index 0x05-0x07, 0x25-0x27) to 0x2f for all bytes */
467                 pci_write_config32_index_wait(ctrl->f2, 0x98, i, dword);
468                 pci_write_config32_index_wait(ctrl->f2, 0x98, i+0x20, dword);
469         }
470
471
472 }
/*
 * Default to building the workaround path for pre-F2 CPUs (see the
 * is_cpu_pre_f2_in_bsp() check in TrainRcvrEn) unless the build has
 * already chosen a value.
 */
#if !defined(K8_REV_F_SUPPORT_F0_F1_WORKAROUND)
#define K8_REV_F_SUPPORT_F0_F1_WORKAROUND 1
#endif
476
477 static unsigned TrainRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
478 {
479
480         static const uint32_t TestPattern0[] = {
481                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
482                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
483                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
484                         0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa,
485                 };
486         static const uint32_t TestPattern1[] = {
487                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
488                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
489                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
490                         0x55555555, 0x55555555, 0x55555555, 0x55555555,
491                 };
492         static const uint32_t TestPattern2[] = {
493                         0x12345678, 0x87654321, 0x23456789, 0x98765432,
494                         0x59385824, 0x30496724, 0x24490795, 0x99938733,
495                         0x40385642, 0x38465245, 0x29432163, 0x05067894,
496                         0x12349045, 0x98723467, 0x12387634, 0x34587623,
497                 };
498
499         uint8_t pattern_buf_x[64 * 4 + 16]; // We need to two cache line So have more 16 bytes to keep 16 byte alignment */
500         uint8_t *buf_a, *buf_b;
501         uint32_t ecc_bit;
502         uint32_t dword;
503         uint8_t *dqs_rcvr_dly_a = &sysinfo->dqs_rcvr_dly_a[ctrl->node_id * 2* 8] ; //8 node, channel 2, receiver 8
504
505         int i;
506
507         unsigned channel, receiver;
508
509         unsigned Errors;
510         unsigned CTLRMaxDelay;
511         unsigned T1000;
512
513         unsigned LastTest;
514         unsigned CurrTest;
515         unsigned Test0, Test1;
516
517         unsigned RcvrEnDlyRmin;
518
519         unsigned two_ranks;
520         unsigned RcvrEnDly;
521
522         unsigned PatternA;
523         unsigned PatternB;
524
525         unsigned TestAddr0, TestAddr0B, TestAddr1, TestAddr1B = 0;
526
527         unsigned CurrRcvrCHADelay = 0;
528
529         unsigned tmp;
530
531         unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
532
533         unsigned cpu_f0_f1;
534
535         if(Pass == DQS_FIRST_PASS) {
536                 InitDQSPos4RcvrEn(ctrl);
537         }
538
539         //enable SSE2
540         enable_sse2();
541
542         //wrap32dis
543         set_wrap32dis();
544
545         //disable ECC temp
546         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
547         ecc_bit = dword & DCL_DimmEccEn;
548         dword &= ~(DCL_DimmEccEn);
549         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
550
551
552         if(Pass == DQS_FIRST_PASS) {
553 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
554         cpu_f0_f1 = is_cpu_pre_f2_in_bsp(ctrl->node_id);
555         if(!cpu_f0_f1)
556 #endif
557         {
558 #if 1
559                 /* Set the DqsRcvEnTrain bit */
560                 dword = pci_read_config32(ctrl->f2, DRAM_CTRL);
561                 dword |= DC_DqsRcvEnTrain;
562                 pci_write_config32(ctrl->f2, DRAM_CTRL, dword);
563 #endif
564         }
565         }
566
567         //get T1000 figures (cycle time (ns)) * 1K
568         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_HIGH);
569         dword &= DCH_MemClkFreq_MASK;
570
571         T1000 = get_exact_T1000(dword);
572
573         // SetupRcvrPattern
574         buf_a = (uint8_t *)(((uint32_t)(&pattern_buf_x[0]) + 0x10) & (0xfffffff0));
575         buf_b = buf_a + 128; //??
576         if(Pass==DQS_FIRST_PASS) {
577                 for(i=0;i<16;i++) {
578                         *((uint32_t *)(buf_a + i*4)) = TestPattern0[i];
579                         *((uint32_t *)(buf_b + i*4)) = TestPattern1[i];
580                 }
581         }
582         else {
583                 for(i=0;i<16;i++) {
584                         *((uint32_t *)(buf_a + i*4)) = TestPattern2[i];
585                         *((uint32_t *)(buf_b + i*4)) = TestPattern2[i];
586                 }
587         }
588
589         print_debug_dqs("\r\nTrainRcvEn: 0 ctrl", ctrl->node_id, 0);
590
591         print_debug_addr("TrainRcvEn: buf_a:", buf_a);
592
593         Errors = 0;
594         /* for each channel */
595         CTLRMaxDelay = 0;
596         channel = 0;
597
598         if (!(sysinfo->meminfo[ctrl->node_id].dimm_mask & 0x0F) &&
599              (sysinfo->meminfo[ctrl->node_id].dimm_mask & 0xF0)) { /* channelB only? */
600                 channel = 1;
601         }
602
603         for ( ; (channel < 2) && (!Errors); channel++)
604         { 
605                 print_debug_dqs("\tTrainRcvEn51: channel ",channel, 1); 
606                 
607                 /* for each rank */ 
608                 /* there are four recriver pairs, loosely associated with CS */ 
609                 for( receiver = 0; (receiver < 8) && (!Errors); receiver+=2) 
610                 {
611
612                         unsigned index=(receiver>>1) * 3 + 0x10;
613
614                         print_debug_dqs("\t\tTrainRcvEn52: index ", index, 2);
615
616                         if(is_Width128) {
617                                 if(channel) {
618                                         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
619                                         CurrRcvrCHADelay= dword & 0xff;
620                                 }
621                         }
622                         else {
623                                 if(channel) {
624                                         index += 0x20;
625                                 }
626                         }
627
628                         LastTest = DQS_FAIL;
629                         RcvrEnDlyRmin = 0xaf;
630
631                         if(!RcvrRankEnabled(ctrl, channel, receiver, is_Width128, sysinfo)) continue;
632
633                         /* for each DQS receiver enable setting */
634
635                         TestAddr0 = Get_RcvrSysAddr(ctrl, channel, receiver, sysinfo);
636
637                         TestAddr0B = TestAddr0 + (1<<(20+2-8)); // 4MB
638
639                         if(RcvrRankEnabled(ctrl, channel, receiver+1, is_Width128, sysinfo)) {
640                                 TestAddr1 = Get_RcvrSysAddr(ctrl, channel, receiver+1, sysinfo);
641                                 TestAddr1B = TestAddr1 + (1<<(20+2-8)); //4MB
642                                 two_ranks = 1;
643                         }
644                         else {
645                                 two_ranks = 0;
646                         }
647
648                         print_debug_dqs("\t\tTrainRcvEn53: TestAddr0B ", TestAddr0B, 2);
649
650                         Write1LTestPattern(TestAddr0, 0, buf_a, buf_b); // rank0 of dimm, test p0
651                         Write1LTestPattern(TestAddr0B, 1, buf_a, buf_b); //rank0 of dimm, test p1
652
653                         if(two_ranks == 1) {
654                                 Write1LTestPattern(TestAddr1, 0, buf_a, buf_b); //rank 1 of dimm
655                                 Write1LTestPattern(TestAddr1B, 1, buf_a, buf_b);//rank 1 of dimm
656                         }
657
658                         if(Pass == DQS_FIRST_PASS) {
659                                 RcvrEnDly = 0;
660                         } else {
661                                 RcvrEnDly = dqs_rcvr_dly_a[channel * 8 + receiver];
662                         }
663
664                         while ( RcvrEnDly < 0xaf) { // Sweep Delay value here
665                                 print_debug_dqs("\t\t\tTrainRcvEn541: RcvrEnDly ", RcvrEnDly, 3);
666
667                                 if(RcvrEnDly & 1) {
668                                         /* Odd steps get another pattern such that even
669                                            and odd steps alternate.
670                                            The pointers to the patterns will be swapped
671                                            at the end of the loop so they are correspond
672                                         */
673                                         PatternA = 1;
674                                         PatternB = 0;
675                                 }
676                                 else {
677                                         /* Even step */
678                                         PatternA = 0;
679                                         PatternB = 1;
680                                 }
681
682                                 /* Program current Receiver enable delay */
683                                 pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
684                                 /* FIXME: 64bit MUX */
685
686                                 if(is_Width128) {
687                                         /* Program current Receiver enable delay chaannel b */
688                                         pci_write_config32_index_wait(ctrl->f2, 0x98, index+ 0x20, RcvrEnDly);
689                                 }
690
691                                 /* Program the MaxAsyncLat filed with the
692                                    current DQS receiver enable setting plus 6ns
693                                 */
694                                 /*Porgram MaxAsyncLat to correspond with current delay */
695                                 SetMaxAL_RcvrDly(ctrl, RcvrEnDly);
696
697                                 CurrTest = DQS_FAIL;
698
699                                 Read1LTestPattern(TestAddr0);  //Cache Fill
700                                 /* ROM vs cache compare */
701                                 Test0 = CompareTestPatternQW0(channel, TestAddr0, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
702                                 proc_IOCLFLUSH(TestAddr0);
703
704                                 ResetDCTWrPtr(ctrl);
705
706                                 print_debug_dqs("\t\t\tTrainRcvEn542: Test0 ", Test0, 3);
707
708                                 if(Test0 == DQS_PASS) {
709
710                                         Read1LTestPattern(TestAddr0B);
711                                         Test1 = CompareTestPatternQW0(channel, TestAddr0B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
712                                         proc_IOCLFLUSH(TestAddr0B);
713
714                                         ResetDCTWrPtr(ctrl);
715
716                                         print_debug_dqs("\t\t\tTrainRcvEn543: Test1 ", Test1, 3);
717
718                                         if(Test1 == DQS_PASS) {
719                                                 if(two_ranks) {
720                                                         Read1LTestPattern(TestAddr1);
721                                                         Test0 = CompareTestPatternQW0(channel, TestAddr1, PatternA, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
722                                                         proc_IOCLFLUSH(TestAddr1);
723                                                         ResetDCTWrPtr(ctrl);
724
725                                                         if(Test0 == DQS_PASS) {
726                                                                 Read1LTestPattern(TestAddr1B);
727                                                                 Test1 = CompareTestPatternQW0(channel, TestAddr1B, PatternB, TestPattern0, TestPattern1, TestPattern2, Pass, is_Width128);
728                                                                 proc_IOCLFLUSH(TestAddr1B);
729                                                                 ResetDCTWrPtr(ctrl);
730
731                                                                 if(Test1 == DQS_PASS) {
732                                                                         CurrTest = DQS_PASS;
733                                                                 }
734                                                         }
735                                                         print_debug_dqs("\t\t\tTrainRcvEn544: Test0 ", Test0, 3);
736                                                 }
737                                                 else {
738                                                         CurrTest = DQS_PASS;
739                                                 }
740                                         }
741                                 }
742
743                                 print_debug_dqs("\t\t\tTrainRcvEn55: RcvrEnDly ", RcvrEnDly, 3);
744
745                                 if(CurrTest == DQS_PASS) {
746                                         if(LastTest == DQS_FAIL) {
747                                                 RcvrEnDlyRmin = RcvrEnDly;
748                                                 break;
749                                         }
750                                 }
751
752                                 LastTest = CurrTest;
753
754                                 /* swap the rank 0 pointers */
755                                 tmp = TestAddr0;
756                                 TestAddr0 = TestAddr0B;
757                                 TestAddr0B = tmp;
758
759                                 /* swap the rank 1 pointers */
760                                 tmp = TestAddr1;
761                                 TestAddr1 = TestAddr1B;
762                                 TestAddr1B = tmp;
763
764                                 print_debug_dqs("\t\t\tTrainRcvEn56: RcvrEnDly ", RcvrEnDly, 3);
765
766                                 RcvrEnDly++;
767
768                         } // while RcvrEnDly
769
770                         print_debug_dqs("\t\tTrainRcvEn61: RcvrEnDly ", RcvrEnDly, 2);
771
772                         if(RcvrEnDlyRmin == 0xaf) {
773                                 //no passing window
774                                 Errors |= SB_NORCVREN;
775                         }
776
777                         if(Pass == DQS_FIRST_PASS) {
778                                 // We need a better value for DQSPos trainning
779                                 RcvrEnDly = RcvrEnDlyRmin /* + RCVREN_MARGIN * T1000/64/50 */;
780                         } else {
781                                 RcvrEnDly = RcvrEnDlyRmin;
782                         }
783
784                         if(RcvrEnDly > 0xae) {
785                                 //passing window too narrow, too far delayed
786                                 Errors |= SB_SmallRCVR;
787                                 RcvrEnDly = 0xae;
788                         }
789
790                         if(Pass == DQS_SECOND_PASS) { //second pass must average vales
791                                 RcvrEnDly += dqs_rcvr_dly_a[channel * 8 + receiver] /* - (RCVREN_MARGIN * T1000/64/50)*/;
792                                 RcvrEnDly >>= 1;
793                         }
794
795                         dqs_rcvr_dly_a[channel * 8 + receiver] = RcvrEnDly;
796
797                         //Set final RcvrEnDly for this DIMM and Channel
798                         pci_write_config32_index_wait(ctrl->f2, 0x98, index, RcvrEnDly);
799
800                         if(is_Width128) {
801                                 pci_write_config32_index_wait(ctrl->f2, 0x98, index+0x20, RcvrEnDly); // channel B
802                                 if(channel) {
803                                         pci_write_config32_index_wait(ctrl->f2, 0x98, index, CurrRcvrCHADelay);
804                                         if(RcvrEnDly > CurrRcvrCHADelay) {
805                                                 dword = RcvrEnDly - CurrRcvrCHADelay;
806                                         }
807                                         else {
808                                                 dword = CurrRcvrCHADelay - RcvrEnDly;
809                                         }
810                                         dword *= 50;
811                                         if(dword > T1000) {
812                                                 Errors |= SB_CHA2BRCVREN;
813                                         }
814                                 }
815                         }
816
817                         print_debug_dqs("\t\tTrainRcvEn63: RcvrEnDly ", RcvrEnDly, 2);
818
819                         if(RcvrEnDly > CTLRMaxDelay) {
820                                 CTLRMaxDelay = RcvrEnDly;
821                         }
822
823                         print_debug_dqs("\t\tTrainRcvEn64: CTLRMaxDelay ", CTLRMaxDelay, 2);
824
825                 } /* receiver */
826         } /* channel */
827
828         print_debug_dqs("\tTrainRcvEn65: CTLRMaxDelay ", CTLRMaxDelay, 1);
829
830         /* Program the MaxAsysncLat field with the largest DQS Receiver Enable setting */
831         SetMaxAL_RcvrDly(ctrl, CTLRMaxDelay);
832         ResetDCTWrPtr(ctrl);
833
834         //Enable ECC again
835         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
836         dword &= ~(DCL_DimmEccEn);
837         dword |= ecc_bit;
838         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
839
840         if(Pass == DQS_FIRST_PASS) {
841 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
842         if(!cpu_f0_f1)
843 #endif
844         {
845                 dword = pci_read_config32(ctrl->f2, DRAM_CTRL);
846                 dword &= ~DC_DqsRcvEnTrain;
847                 pci_write_config32(ctrl->f2, DRAM_CTRL, dword);
848         }
849         }
850
851         //Clear wrap32dis
852
853         clear_wrap32dis();
854
855         //restore SSE2 setting
856         disable_sse2();
857
858 #if MEM_TRAIN_SEQ != 1
859         /* We need tidy output for type 1 */
860         printk_debug(" CTLRMaxDelay=%02x\n", CTLRMaxDelay);
861 #endif
862
863         return (CTLRMaxDelay==0xae)?1:0;
864
865 }
866
/*
 * Direction selectors handed to the DQS delay helpers in this file.
 * The numeric value is used directly in arithmetic there (it is shifted or
 * multiplied into the delay-register index and into the offset of the saved
 * delay array), so the values are not arbitrary tags.
 */
#define DQS_READDIR 1
#define DQS_WRITEDIR 0
869
870
871 static void SetDQSDelayCSR(const struct mem_controller *ctrl, unsigned channel, unsigned bytelane, unsigned direction, unsigned dqs_delay)
872 { //ByteLane could be 0-8, last is for ECC
873         unsigned index;
874         uint32_t dword;
875         unsigned shift;
876
877         dqs_delay &= 0xff;
878
879         index = (bytelane>>2) + 1 + channel * 0x20 + (direction << 2);
880         shift = bytelane;
881         while(shift>3) {
882                 shift-=4;
883         }
884         shift <<= 3; // 8 bit
885
886         dword = pci_read_config32_index_wait(ctrl->f2, 0x98, index);
887         dword &= ~(0x3f<<shift);
888         dword |= (dqs_delay<<shift);
889         pci_write_config32_index_wait(ctrl->f2, 0x98, index, dword);
890
891 }
892
893 static void SetDQSDelayAllCSR(const struct mem_controller *ctrl, unsigned channel, unsigned direction, unsigned dqs_delay)
894 {
895         unsigned index;
896         uint32_t dword;
897         int i;
898
899         dword = 0;
900         dqs_delay &= 0xff;
901         for(i=0;i<4;i++) {
902                 dword |= dqs_delay<<(i*8);
903         }
904
905         index = 1 + channel * 0x20 + direction * 4;
906
907         for(i=0; i<2; i++) {
908                 pci_write_config32_index_wait(ctrl->f2, 0x98, index + i, dword);
909         }
910
911 }
912
static unsigned MiddleDQS(unsigned min_d, unsigned max_d)
{
        /* Midpoint of [min_d, max_d], rounded up when the width is odd. */
        const unsigned width = max_d - min_d;

        return min_d + (width >> 1) + (width & 1);
}
922
static inline void save_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, uint8_t *dqs_delay_a, uint8_t dqs_delay)
{
        /* Slot layout: 18 entries per channel, 9 per direction, one per byte lane. */
        const unsigned slot = bytelane + 9 * direction + 2 * 9 * channel;

        dqs_delay_a[slot] = dqs_delay;
}
927
static void WriteDQSTestPattern(unsigned addr_lo, unsigned pattern , uint8_t *buf_a)
{
        /*
         * Forward to WriteLNTestPattern with a count of 9 for pattern 0 and
         * 18 for pattern 1 (presumably a number of cache lines, matching the
         * L9/L18 read helpers - confirm against WriteLNTestPattern).
         */
        const unsigned count = 9 * (pattern + 1);

        WriteLNTestPattern(addr_lo, buf_a, count);
}
932
static void ReadL18TestPattern(unsigned addr_lo)
{
        /*
         * Issue one 32-bit load from each of 18 consecutive 64-byte lines
         * starting at fs:addr_lo. The loaded values are discarded; only the
         * memory accesses themselves matter. FS base must already be set up
         * by the caller.
         *
         * Every load targets %eax, so %eax is declared as an (early-clobber)
         * output. Declaring it only as an input, as was done before, tells
         * the compiler the register is unchanged after the asm, which is false.
         */
        unsigned scratch;

        //set fs and use fs prefix to access the mem
        __asm__ volatile (
                "movl %%fs:-128(%%esi), %%eax\n\t"  //TestAddr cache line
                "movl %%fs:-64(%%esi), %%eax\n\t"   //+1
                "movl %%fs:(%%esi), %%eax\n\t"  //+2
                "movl %%fs:64(%%esi), %%eax\n\t"   //+3

                "movl %%fs:-128(%%edi), %%eax\n\t"      //+4
                "movl %%fs:-64(%%edi), %%eax\n\t"       //+5
                "movl %%fs:(%%edi), %%eax\n\t"  //+6
                "movl %%fs:64(%%edi), %%eax\n\t"        //+7

                "movl %%fs:-128(%%ebx), %%eax\n\t"  //+8
                "movl %%fs:-64(%%ebx), %%eax\n\t"       //+9
                "movl %%fs:(%%ebx), %%eax\n\t"  //+10
                "movl %%fs:64(%%ebx), %%eax\n\t"        //+11

                "movl %%fs:-128(%%ecx), %%eax\n\t"      //+12
                "movl %%fs:-64(%%ecx), %%eax\n\t"       //+13
                "movl %%fs:(%%ecx), %%eax\n\t"  //+14
                "movl %%fs:64(%%ecx), %%eax\n\t"        //+15

                "movl %%fs:-128(%%edx), %%eax\n\t"      //+16
                "movl %%fs:-64(%%edx), %%eax\n\t"       //+17

                : "=&a"(scratch)
                : "b" (addr_lo+128+8*64), "c" (addr_lo+128+12*64), "d" (addr_lo +128+16*64), "S"(addr_lo+128), "D"(addr_lo+128+4*64)
        );

        (void)scratch; /* value is irrelevant, only the reads are wanted */
}
964
static void ReadL9TestPattern(unsigned addr_lo)
{
        /*
         * Issue one 32-bit load from each of 9 consecutive 64-byte lines
         * starting at fs:addr_lo. The loaded values are discarded; only the
         * memory accesses themselves matter. FS base must already be set up
         * by the caller.
         *
         * Every load targets %eax, so %eax is declared as an (early-clobber)
         * output. Declaring it only as an input, as was done before, tells
         * the compiler the register is unchanged after the asm, which is false.
         */
        unsigned scratch;

        //set fs and use fs prefix to access the mem
        __asm__ volatile (

                "movl %%fs:-128(%%ecx), %%eax\n\t"  //TestAddr cache line
                "movl %%fs:-64(%%ecx), %%eax\n\t"   //+1
                "movl %%fs:(%%ecx), %%eax\n\t"      //+2
                "movl %%fs:64(%%ecx), %%eax\n\t"   //+3

                "movl %%fs:-128(%%edx), %%eax\n\t"  //+4
                "movl %%fs:-64(%%edx), %%eax\n\t"   //+5
                "movl %%fs:(%%edx), %%eax\n\t"      //+6
                "movl %%fs:64(%%edx), %%eax\n\t"   //+7

                "movl %%fs:-128(%%ebx), %%eax\n\t"      //+8

                : "=&a"(scratch)
                : "b" (addr_lo+128+8*64), "c"(addr_lo+128), "d"(addr_lo+128+4*64)
        );

        (void)scratch; /* value is irrelevant, only the reads are wanted */
}
987
988
static void ReadDQSTestPattern(unsigned addr_lo, unsigned pattern)
{
        /* Pattern 0 reads 9 lines; any other pattern reads 18 lines. */
        if (pattern != 0) {
                ReadL18TestPattern(addr_lo);
                return;
        }

        ReadL9TestPattern(addr_lo);
}
998
static void FlushDQSTestPattern_L9(unsigned addr_lo)
{
        /*
         * clflush the 9 consecutive 64-byte lines starting at fs:addr_lo.
         * Each base register points 128 bytes into a group of four lines so
         * that the displacements -128, -64, 0 and 64 reach all of them.
         */
        const unsigned lines_0_3 = addr_lo+128;
        const unsigned lines_4_7 = addr_lo+128+4*64;
        const unsigned line_8 = addr_lo+128+8*64;

        __asm__ volatile (
                "clflush %%fs:-128(%%ecx)\n\t"
                "clflush %%fs:-64(%%ecx)\n\t"
                "clflush %%fs:(%%ecx)\n\t"
                "clflush %%fs:64(%%ecx)\n\t"

                "clflush %%fs:-128(%%eax)\n\t"
                "clflush %%fs:-64(%%eax)\n\t"
                "clflush %%fs:(%%eax)\n\t"
                "clflush %%fs:64(%%eax)\n\t"

                "clflush %%fs:-128(%%ebx)\n\t"

                ::  "a"(lines_4_7), "b"(line_8), "c"(lines_0_3)
        );
}
static __attribute__((noinline)) void FlushDQSTestPattern_L18(unsigned addr_lo)
{
        /*
         * clflush the 18 consecutive 64-byte lines starting at fs:addr_lo.
         * Each base register points 128 bytes into a group of four lines so
         * that the displacements -128, -64, 0 and 64 reach all of them; the
         * last group only has two lines.
         */
        const unsigned lines_0_3 = addr_lo+128;
        const unsigned lines_4_7 = addr_lo+128+4*64;
        const unsigned lines_8_11 = addr_lo+128+8*64;
        const unsigned lines_12_15 = addr_lo+128+12*64;
        const unsigned lines_16_17 = addr_lo +128+16*64;

        __asm__ volatile (
                "clflush %%fs:-128(%%eax)\n\t"
                "clflush %%fs:-64(%%eax)\n\t"
                "clflush %%fs:(%%eax)\n\t"
                "clflush %%fs:64(%%eax)\n\t"

                "clflush %%fs:-128(%%edi)\n\t"
                "clflush %%fs:-64(%%edi)\n\t"
                "clflush %%fs:(%%edi)\n\t"
                "clflush %%fs:64(%%edi)\n\t"

                "clflush %%fs:-128(%%ebx)\n\t"
                "clflush %%fs:-64(%%ebx)\n\t"
                "clflush %%fs:(%%ebx)\n\t"
                "clflush %%fs:64(%%ebx)\n\t"

                "clflush %%fs:-128(%%ecx)\n\t"
                "clflush %%fs:-64(%%ecx)\n\t"
                "clflush %%fs:(%%ecx)\n\t"
                "clflush %%fs:64(%%ecx)\n\t"

                "clflush %%fs:-128(%%edx)\n\t"
                "clflush %%fs:-64(%%edx)\n\t"

                :: "a"(lines_0_3), "D"(lines_4_7), "b"(lines_8_11), "c"(lines_12_15), "d"(lines_16_17)
        );
}
1047
static void FlushDQSTestPattern(unsigned addr_lo, unsigned pattern )
{
        /* Pattern 0 flushes 9 lines; any other pattern flushes 18 lines. */
        if (pattern != 0) {
                FlushDQSTestPattern_L18(addr_lo);
                return;
        }

        FlushDQSTestPattern_L9(addr_lo);
}
1058
static unsigned CompareDQSTestPattern(unsigned channel, unsigned addr_lo, unsigned pattern, uint8_t *buf_a)
{
        /*
         * Compare what is read back through fs:addr_lo with the reference
         * pattern in buf_a, one dword at a time, and return a bitmap with one
         * bit per byte lane (bit set = every byte of that lane matched).
         *
         * For pattern 1 the data of the two channels alternate in 8-byte
         * units, so only every other 8-byte unit belongs to 'channel'.
         */
        uint32_t *expected_p = (uint32_t *)buf_a;
        unsigned pass_map = 0xff;
        unsigned lane = 0;
        int dword_idx;

        if (pattern && channel) {
                /* second channel: its data starts one 8-byte unit later */
                addr_lo += 8;
                expected_p += 2;
        }

        for (dword_idx = 0; dword_idx < 9*64/4; dword_idx++) {
                uint32_t readback;
                uint32_t expected;
                int byte;

                __asm__ volatile (
                        "movl %%fs:(%1), %0\n\t"
                        :"=b"(readback): "a" (addr_lo)
                );
                expected = *expected_p;

                print_debug_dqs_pair("\t\t\t\t\t\ttest_buf= ", (unsigned)expected_p, " value = ", expected, 7);
                print_debug_dqs_pair("\t\t\t\t\t\ttaddr_lo = ",addr_lo, " value = ", readback, 7);

                /* Each of the four bytes of the dword belongs to the next lane. */
                for (byte = 0; byte < 4; byte++) {
                        const unsigned shift = byte * 8;

                        if (((readback ^ expected) >> shift) & 0xff) {
                                pass_map &= ~(1<<lane);
                        }
                        lane = (lane + 1) & 0x7;
                }
                print_debug_dqs("\t\t\t\t\t\tbitmap = ", pass_map, 7);

                if (lane == 0 && pattern == 1) {
                        /* dual channel: skip over the other channel's 8 bytes */
                        addr_lo += 8;
                        expected_p += 2;
                }
                addr_lo += 4;
                expected_p += 1;
        }

        return pass_map;
}
1113
1114 static unsigned TrainDQSPos(const struct mem_controller *ctrl, unsigned channel, unsigned Direction, unsigned Pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1115 {
1116         unsigned ByteLane;
1117         unsigned Errors;
1118         unsigned BanksPresent;
1119
1120         unsigned MutualCSPassW[48];
1121
1122         unsigned ChipSel;
1123         unsigned DQSDelay;
1124
1125         unsigned TestAddr;
1126
1127         unsigned LastTest;
1128         unsigned RnkDlyFilterMax, RnkDlyFilterMin = 0;
1129         unsigned RnkDlySeqPassMax, RnkDlySeqPassMin = 0;
1130
1131         Errors = 0;
1132         BanksPresent = 0;
1133
1134         print_debug_dqs("\t\t\tTrainDQSPos begin ", 0, 3);
1135
1136         printk_debug("TrainDQSPos: MutualCSPassW[48] :%p\n", MutualCSPassW);
1137
1138         for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
1139                 MutualCSPassW[DQSDelay] = 0xff; // Bitmapped status per delay setting, 0xff=All positions passing (1= PASS)
1140         }
1141
1142         for(ChipSel = 0; ChipSel < 8; ChipSel++) { //logical register chipselects 0..7
1143                 print_debug_dqs("\t\t\t\tTrainDQSPos: 11 ChipSel ", ChipSel, 4);
1144                 //FIXME: process 64MUXedMode
1145                 if(!ChipSelPresent(ctrl, ChipSel, sysinfo)) continue;
1146                 BanksPresent  = 1;
1147
1148                 TestAddr = Get_MCTSysAddr(ctrl, ChipSel, sysinfo);
1149
1150                 print_debug_dqs("\t\t\t\tTrainDQSPos: 12 TestAddr ", TestAddr, 4);
1151
1152                 //set fs and use fs prefix to access the mem
1153                 set_FSBASE(TestAddr>>24);
1154
1155                 if(Direction == DQS_READDIR) {
1156                         print_debug_dqs("\t\t\t\tTrainDQSPos: 13 for read so write at first", 0, 4);
1157                         WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a);
1158                 }
1159
1160                 for(DQSDelay = 0; DQSDelay < 48; DQSDelay++ ){
1161                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 141 DQSDelay ", DQSDelay, 5);
1162                         if(MutualCSPassW[DQSDelay] == 0) continue; //skip current delay value if other chipselects have failed all 8 bytelanes
1163                         SetDQSDelayAllCSR(ctrl, channel, Direction, DQSDelay);
1164                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 142 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
1165                         if(Direction == DQS_WRITEDIR) {
1166                                 print_debug_dqs("\t\t\t\t\tTrainDQSPos: 143 for write", 0, 5);
1167                                 WriteDQSTestPattern(TestAddr<<8, Pattern, buf_a);
1168                         }
1169                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 144 Pattern ", Pattern, 5);
1170                         ReadDQSTestPattern(TestAddr<<8, Pattern);
1171                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 145 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
1172                         MutualCSPassW[DQSDelay] &= CompareDQSTestPattern(channel, TestAddr<<8, Pattern, buf_a); //0: fail, 1=pass
1173                         print_debug_dqs("\t\t\t\t\tTrainDQSPos: 146 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
1174                         SetTargetWTIO(TestAddr);
1175                         FlushDQSTestPattern(TestAddr<<8, Pattern);
1176                         ResetTargetWTIO();
1177                 }
1178         }
1179
1180         if(BanksPresent)
1181         for(ByteLane = 0; ByteLane < 8; ByteLane++) {
1182                 print_debug_dqs("\t\t\t\tTrainDQSPos: 31 ByteLane ",ByteLane, 4);
1183
1184                 LastTest = DQS_FAIL;
1185                 RnkDlySeqPassMax = 0;
1186                 RnkDlyFilterMax = 0;
1187                 RnkDlyFilterMin = 0;
1188                 for(DQSDelay=0; DQSDelay<48; DQSDelay++) {
1189                         if(MutualCSPassW[DQSDelay] & (1<<ByteLane)) {
1190
1191                                 print_debug_dqs("\t\t\t\t\tTrainDQSPos: 321 DQSDelay ", DQSDelay, 5);
1192                                 print_debug_dqs("\t\t\t\t\tTrainDQSPos: 322 MutualCSPassW ", MutualCSPassW[DQSDelay], 5);
1193
1194                                 RnkDlySeqPassMax = DQSDelay;
1195                                 if(LastTest == DQS_FAIL) {
1196                                         RnkDlySeqPassMin = DQSDelay; //start sequential run
1197                                 }
1198                                 if((RnkDlySeqPassMax - RnkDlySeqPassMin)>(RnkDlyFilterMax-RnkDlyFilterMin)){
1199                                         RnkDlyFilterMin = RnkDlySeqPassMin;
1200                                         RnkDlyFilterMax = RnkDlySeqPassMax;
1201                                 }
1202                                 LastTest = DQS_PASS;
1203                         }
1204                         else {
1205                                 LastTest = DQS_FAIL;
1206                         }
1207                 }
1208                 print_debug_dqs("\t\t\t\tTrainDQSPos: 33 RnkDlySeqPassMax ", RnkDlySeqPassMax, 4);
1209
1210                 if(RnkDlySeqPassMax == 0) {
1211                         Errors |= SB_NODQSPOS; // no passing window
1212                 }
1213                 else {
1214                         print_debug_dqs("\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMax ", RnkDlyFilterMax, 4);
1215                         print_debug_dqs("\t\t\t\tTrainDQSPos: 34 RnkDlyFilterMin ", RnkDlyFilterMin, 4);
1216                         if((RnkDlyFilterMax - RnkDlyFilterMin)< MIN_DQS_WNDW){
1217                                 Errors |= SB_SMALLDQS;
1218                         }
1219                         else {
1220                                 unsigned middle_dqs;
1221                                 middle_dqs = MiddleDQS(RnkDlyFilterMin, RnkDlyFilterMax);
1222                                 print_debug_dqs("\t\t\t\tTrainDQSPos: 35 middle_dqs ",middle_dqs, 4);
1223                                 SetDQSDelayCSR(ctrl, channel, ByteLane, Direction, middle_dqs);
1224                                 save_dqs_delay(channel, ByteLane, Direction, dqs_delay_a, middle_dqs);
1225                         }
1226                 }
1227
1228         }
1229
1230         print_debug_dqs("\t\t\tTrainDQSPos: end", 0xff, 3);
1231
1232         return Errors;
1233
1234
1235 }
1236
1237 static unsigned TrainReadDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1238 {
1239         print_debug_dqs("\t\tTrainReadPos", 0, 2);
1240         return TrainDQSPos(ctrl, channel, DQS_READDIR, pattern, buf_a, dqs_delay_a, sysinfo);
1241 }
1242
1243 static unsigned TrainWriteDQS(const struct mem_controller *ctrl, unsigned channel, unsigned pattern, uint8_t *buf_a, uint8_t *dqs_delay_a, struct sys_info *sysinfo)
1244 {
1245         print_debug_dqs("\t\tTrainWritePos", 0, 2);
1246         return TrainDQSPos(ctrl, channel, DQS_WRITEDIR, pattern, buf_a, dqs_delay_a, sysinfo);
1247 }
1248
1249
1250
1251 static unsigned TrainDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1252 {
1253         static const uint32_t TestPatternJD1a[] = {
1254                                         0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW0-1, ALL-EVEN
1255                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW2-3, ALL-EVEN
1256                                         0x00000000,0x00000000,0xFFFFFFFF,0xFFFFFFFF, // QW4-5, ALL-EVEN
1257                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW6-7, ALL-EVEN
1258                                         0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW0-1, DQ0-ODD
1259                                         0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW2-3, DQ0-ODD
1260                                         0x01010101,0x01010101,0xFeFeFeFe,0xFeFeFeFe, // QW4-5, DQ0-ODD
1261                                         0xFeFeFeFe,0xFeFeFeFe,0x01010101,0x01010101, // QW6-7, DQ0-ODD
1262                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW0-1, DQ1-ODD
1263                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2-3, DQ1-ODD
1264                                         0xFdFdFdFd,0xFdFdFdFd,0x02020202,0x02020202, // QW4-5, DQ1-ODD
1265                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW6-7, DQ1-ODD
1266                                         0x04040404,0x04040404,0xfBfBfBfB,0xfBfBfBfB, // QW0-1, DQ2-ODD
1267                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW2-3, DQ2-ODD
1268                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4-5, DQ2-ODD
1269                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6-7, DQ2-ODD
1270                                         0x08080808,0x08080808,0xF7F7F7F7,0xF7F7F7F7, // QW0-1, DQ3-ODD
1271                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW2-3, DQ3-ODD
1272                                         0xF7F7F7F7,0xF7F7F7F7,0x08080808,0x08080808, // QW4-5, DQ3-ODD
1273                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6-7, DQ3-ODD
1274                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW0-1, DQ4-ODD
1275                                         0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW2-3, DQ4-ODD
1276                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4-5, DQ4-ODD
1277                                         0xeFeFeFeF,0xeFeFeFeF,0x10101010,0x10101010, // QW6-7, DQ4-ODD
1278                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0-1, DQ5-ODD
1279                                         0xdFdFdFdF,0xdFdFdFdF,0x20202020,0x20202020, // QW2-3, DQ5-ODD
1280                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4-5, DQ5-ODD
1281                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6-7, DQ5-ODD
1282                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0-1, DQ6-ODD
1283                                         0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW2-3, DQ6-ODD
1284                                         0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW4-5, DQ6-ODD
1285                                         0x40404040,0x40404040,0xBfBfBfBf,0xBfBfBfBf, // QW6-7, DQ6-ODD
1286                                         0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW0-1, DQ7-ODD
1287                                         0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW2-3, DQ7-ODD
1288                                         0x80808080,0x80808080,0x7F7F7F7F,0x7F7F7F7F, // QW4-5, DQ7-ODD
1289                                         0x80808080,0x80808080,0x80808080,0x80808080  // QW6-7, DQ7-ODD
1290                 };
1291         static const uint32_t TestPatternJD1b[] = {
1292                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW0,CHA-B, ALL-EVEN
1293                                         0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW1,CHA-B, ALL-EVEN
1294                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW2,CHA-B, ALL-EVEN
1295                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW3,CHA-B, ALL-EVEN
1296                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW4,CHA-B, ALL-EVEN
1297                                         0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF, // QW5,CHA-B, ALL-EVEN
1298                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW6,CHA-B, ALL-EVEN
1299                                         0x00000000,0x00000000,0x00000000,0x00000000, // QW7,CHA-B, ALL-EVEN
1300                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW0,CHA-B, DQ0-ODD
1301                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW1,CHA-B, DQ0-ODD
1302                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW2,CHA-B, DQ0-ODD
1303                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW3,CHA-B, DQ0-ODD
1304                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW4,CHA-B, DQ0-ODD
1305                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW5,CHA-B, DQ0-ODD
1306                                         0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe,0xFeFeFeFe, // QW6,CHA-B, DQ0-ODD
1307                                         0x01010101,0x01010101,0x01010101,0x01010101, // QW7,CHA-B, DQ0-ODD
1308                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW0,CHA-B, DQ1-ODD
1309                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW1,CHA-B, DQ1-ODD
1310                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW2,CHA-B, DQ1-ODD
1311                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW3,CHA-B, DQ1-ODD
1312                                         0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd,0xFdFdFdFd, // QW4,CHA-B, DQ1-ODD
1313                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW5,CHA-B, DQ1-ODD
1314                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW6,CHA-B, DQ1-ODD
1315                                         0x02020202,0x02020202,0x02020202,0x02020202, // QW7,CHA-B, DQ1-ODD
1316                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW0,CHA-B, DQ2-ODD
1317                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW1,CHA-B, DQ2-ODD
1318                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW2,CHA-B, DQ2-ODD
1319                                         0x04040404,0x04040404,0x04040404,0x04040404, // QW3,CHA-B, DQ2-ODD
1320                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW4,CHA-B, DQ2-ODD
1321                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW5,CHA-B, DQ2-ODD
1322                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW6,CHA-B, DQ2-ODD
1323                                         0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB,0xfBfBfBfB, // QW7,CHA-B, DQ2-ODD
1324                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW0,CHA-B, DQ3-ODD
1325                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW1,CHA-B, DQ3-ODD
1326                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW2,CHA-B, DQ3-ODD
1327                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW3,CHA-B, DQ3-ODD
1328                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW4,CHA-B, DQ3-ODD
1329                                         0x08080808,0x08080808,0x08080808,0x08080808, // QW5,CHA-B, DQ3-ODD
1330                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW6,CHA-B, DQ3-ODD
1331                                         0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7,0xF7F7F7F7, // QW7,CHA-B, DQ3-ODD
1332                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW0,CHA-B, DQ4-ODD
1333                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW1,CHA-B, DQ4-ODD
1334                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW2,CHA-B, DQ4-ODD
1335                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW3,CHA-B, DQ4-ODD
1336                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW4,CHA-B, DQ4-ODD
1337                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW5,CHA-B, DQ4-ODD
1338                                         0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF,0xeFeFeFeF, // QW6,CHA-B, DQ4-ODD
1339                                         0x10101010,0x10101010,0x10101010,0x10101010, // QW7,CHA-B, DQ4-ODD
1340                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW0,CHA-B, DQ5-ODD
1341                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW1,CHA-B, DQ5-ODD
1342                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW2,CHA-B, DQ5-ODD
1343                                         0x20202020,0x20202020,0x20202020,0x20202020, // QW3,CHA-B, DQ5-ODD
1344                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW4,CHA-B, DQ5-ODD
1345                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW5,CHA-B, DQ5-ODD
1346                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW6,CHA-B, DQ5-ODD
1347                                         0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF,0xdFdFdFdF, // QW7,CHA-B, DQ5-ODD
1348                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW0,CHA-B, DQ6-ODD
1349                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW1,CHA-B, DQ6-ODD
1350                                         0x40404040,0x40404040,0x40404040,0x40404040, // QW2,CHA-B, DQ6-ODD
1351                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW3,CHA-B, DQ6-ODD
1352                                         0x40404040,0x40404040,0x40404040,0x40404040, // QW4,CHA-B, DQ6-ODD
1353                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW5,CHA-B, DQ6-ODD
1354                                         0x40404040,0x40404040,0x40404040,0x40404040, // QW6,CHA-B, DQ6-ODD
1355                                         0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf,0xBfBfBfBf, // QW7,CHA-B, DQ6-ODD
1356                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW0,CHA-B, DQ7-ODD
1357                                         0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW1,CHA-B, DQ7-ODD
1358                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW2,CHA-B, DQ7-ODD
1359                                         0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW3,CHA-B, DQ7-ODD
1360                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW4,CHA-B, DQ7-ODD
1361                                         0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F,0x7F7F7F7F, // QW5,CHA-B, DQ7-ODD
1362                                         0x80808080,0x80808080,0x80808080,0x80808080, // QW6,CHA-B, DQ7-ODD
1363                                         0x80808080,0x80808080,0x80808080,0x80808080  // QW7,CHA-B, DQ7-ODD
1364                 };
1365         uint8_t pattern_buf_x[64 * 18 + 16]; // We need to two cache line So have more 16 bytes to keep 16 byte alignment */
1366         uint8_t *buf_a;
1367
1368         unsigned pattern;
1369         uint32_t dword;
1370         uint32_t ecc_bit;
1371         unsigned Errors;
1372         unsigned channel;
1373         int i;
1374         unsigned DQSWrDelay;
1375         unsigned is_Width128 = sysinfo->meminfo[ctrl->node_id].is_Width128;
1376         uint8_t *dqs_delay_a = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9
1377
1378         //enable SSE2
1379         enable_sse2();
1380
1381         //wrap32dis
1382         set_wrap32dis();
1383
1384         //disable ECC temp
1385         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
1386         ecc_bit = dword & DCL_DimmEccEn;
1387         dword &= ~(DCL_DimmEccEn);
1388         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
1389
1390         //SetupDqsPattern
1391         buf_a = (uint8_t *)(((uint32_t)(&pattern_buf_x[0]) + 0x10) & (~0xf));
1392
1393         if(is_Width128){
1394                 pattern = 1;
1395                 for(i=0;i<16*18;i++) {
1396                         *((uint32_t *)(buf_a + i*4)) = TestPatternJD1b[i];
1397                  }
1398         }
1399         else {
1400                 pattern = 0;
1401                 for(i=0; i<16*9;i++) {
1402                         *((uint32_t *)(buf_a + i*4)) = TestPatternJD1a[i];
1403                 }
1404
1405         }
1406
1407         print_debug_dqs("\r\nTrainDQSRdWrPos: 0 ctrl ", ctrl->node_id, 0);
1408
1409         printk_debug("TrainDQSRdWrPos: buf_a:%p\n", buf_a);
1410
1411         Errors = 0;
1412         channel = 0;
1413
1414         if (!(sysinfo->meminfo[ctrl->node_id].dimm_mask & 0x0F) &&
1415              (sysinfo->meminfo[ctrl->node_id].dimm_mask & 0xF0)) { /* channelB only? */
1416                 channel = 1;
1417         }
1418
1419         while( (channel<2) && (!Errors)) {
1420                 print_debug_dqs("\tTrainDQSRdWrPos: 1 channel ",channel, 1);
1421                 for(DQSWrDelay = 0; DQSWrDelay < 48; DQSWrDelay++) {
1422                         unsigned err;
1423                         SetDQSDelayAllCSR(ctrl, channel, DQS_WRITEDIR, DQSWrDelay);
1424                         print_debug_dqs("\t\tTrainDQSRdWrPos: 21 DQSWrDelay ", DQSWrDelay, 2);
1425                         err= TrainReadDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
1426                         print_debug_dqs("\t\tTrainDQSRdWrPos: 22 err ",err, 2);
1427                         if(err == 0) break;
1428                         Errors |= err;
1429                 }
1430
1431                 print_debug_dqs("\tTrainDQSRdWrPos: 3 DQSWrDelay ", DQSWrDelay, 1);
1432
1433                 if(DQSWrDelay < 48) {
1434                         Errors = TrainWriteDQS(ctrl, channel, pattern, buf_a, dqs_delay_a, sysinfo);
1435                         print_debug_dqs("\tTrainDQSRdWrPos: 4 Errors ", Errors, 1);
1436
1437                 }
1438                 channel++;
1439                 if(!is_Width128){
1440                         //FIXME: 64MuxMode??
1441                         channel++; // skip channel if 64-bit mode
1442                 }
1443         }
1444
1445         //Enable ECC again
1446         dword = pci_read_config32(ctrl->f2, DRAM_CONFIG_LOW);
1447         dword &= ~(DCL_DimmEccEn);
1448         dword |= ecc_bit;
1449         pci_write_config32(ctrl->f2, DRAM_CONFIG_LOW, dword);
1450
1451         //Clear wrap32dis
1452
1453         clear_wrap32dis();
1454
1455         //restore SSE2 setting
1456         disable_sse2();
1457
1458         print_debug_dqs("TrainDQSRdWrPos: ", 5, 0);
1459
1460         return Errors;
1461
1462 }
/*
 * Fetch one stored DQS delay from a per-node delay table.
 *
 * The table is indexed as [channel][direction][bytelane], with two
 * directions per channel and nine byte lanes per direction.
 */
static inline uint8_t get_dqs_delay(unsigned channel, unsigned bytelane, unsigned direction, uint8_t *dqs_delay_a)
{
        const unsigned lanes_per_dir = 9;
        unsigned slot = (channel * 2 + direction) * lanes_per_dir + bytelane;

        return dqs_delay_a[slot];
}
1467
/*
 * Derive a DQS delay by interpolating between the stored delays of two
 * byte lanes (same channel, same direction).
 *
 * InterFactor selects the position on a 0..0xff scale:
 *   0x00: the delay of ByteLane0
 *   0x80: about halfway between ByteLane0 and ByteLane1
 *   0xff: (almost) the delay of ByteLane1
 *
 * The weighted difference is divided by 256 (right shift by 8), not 255,
 * and the shift truncates.  NOTE(review): as a consequence, when
 * ByteLane0 holds the larger delay and InterFactor is 0, the result is
 * one step below ByteLane0's delay rather than exactly equal to it.
 */
static unsigned CalcEccDQSPos(unsigned channel,unsigned ByteLane0, unsigned ByteLane1, unsigned InterFactor, unsigned Direction, uint8_t *dqs_delay_a)
{
        unsigned d0 = get_dqs_delay(channel, ByteLane0, Direction, dqs_delay_a);
        unsigned d1 = get_dqs_delay(channel, ByteLane1, Direction, dqs_delay_a);
        unsigned low, span;

        if (d0 > d1) {
                /* Interpolation starts from the smaller delay (lane 1 here),
                 * so the weight has to be mirrored. */
                low = d1;
                span = d0 - d1;
                InterFactor = 0xff - InterFactor;
        } else {
                low = d0;
                span = d1 - d0;
        }

        return low + ((span * InterFactor) >> 8);
}
1502
1503 static void SetEccDQSRdWrPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1504 {
1505         unsigned channel;
1506         unsigned ByteLane;
1507         unsigned Direction;
1508         unsigned lane0, lane1, ratio;
1509         unsigned dqs_delay;
1510
1511         unsigned direction[] = { DQS_READDIR, DQS_WRITEDIR };
1512         int i;
1513         uint8_t *dqs_delay_a = &sysinfo->dqs_delay_a[ctrl->node_id * 2*2*9]; //channel 2, direction 2 , bytelane *9
1514
1515         ByteLane = 8;
1516
1517         for(channel = 0; channel < 2; channel++) {
1518                 for(i=0;i<2;i++) {
1519                         Direction = direction[i];
1520                         lane0 = 4; lane1 = 5; ratio = 0;
1521                         dqs_delay = CalcEccDQSPos(channel, lane0, lane1, ratio, Direction, dqs_delay_a);
1522                         print_debug_dqs_pair("\t\tSetEccDQSRdWrPos: channel ", channel, Direction==DQS_READDIR? " R dqs_delay":" W dqs_delay",  dqs_delay, 2);
1523                         SetDQSDelayCSR(ctrl, channel, ByteLane, Direction, dqs_delay);
1524                         save_dqs_delay(channel, ByteLane, Direction, dqs_delay_a, dqs_delay);
1525                 }
1526         }
1527 }
1528
1529 static unsigned train_DqsRcvrEn(const struct mem_controller *ctrl, unsigned Pass, struct sys_info *sysinfo)
1530 {
1531         print_debug_dqs("\r\ntrain_DqsRcvrEn: begin ctrl ", ctrl->node_id, 0);
1532         if(TrainRcvrEn(ctrl, Pass, sysinfo)) {
1533                 return 1;
1534         }
1535         print_debug_dqs("\r\ntrain_DqsRcvrEn: end ctrl ", ctrl->node_id, 0);
1536         return 0;
1537
1538 }
1539 static unsigned train_DqsPos(const struct mem_controller *ctrl, struct sys_info *sysinfo)
1540 {
1541         print_debug_dqs("\r\ntrain_DqsPos: begin ctrl ", ctrl->node_id, 0);
1542         if(TrainDQSRdWrPos(ctrl, sysinfo) != 0) {
1543                 printk_err("\r\nDQS Training Rd Wr failed ctrl%02x\r\n", ctrl->node_id);
1544                 return 1;
1545         }
1546         else {
1547                 SetEccDQSRdWrPos(ctrl, sysinfo);
1548         }
1549         print_debug_dqs("\r\ntrain_DqsPos: end ctrl ", ctrl->node_id, 0);
1550         return 0;
1551
1552 }
1553
1554 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1555 static void f0_svm_workaround(int controllers, const struct mem_controller *ctrl, tsc_t *tsc0, struct sys_info *sysinfo)
1556 {
1557         tsc_t tsc1[8];
1558         unsigned cpu_f0_f1[8];
1559         int i;
1560
1561         print_debug_addr("dqs_timing: tsc1[8] :", tsc1);
1562
1563         for(i = 0; i < controllers; i++) {
1564                 if (!sysinfo->ctrl_present[i])
1565                         continue;
1566
1567                 /* Skip everything if I don't have any memory on this controller */
1568                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1569
1570                 uint32_t dword;
1571
1572                 cpu_f0_f1[i] = is_cpu_pre_f2_in_bsp(i);
1573
1574                 if(!cpu_f0_f1[i]) continue;
1575
1576                 dword = pci_read_config32(ctrl[i].f2, DRAM_CTRL);
1577                 dword &= ~DC_DqsRcvEnTrain;
1578                 pci_write_config32(ctrl[i].f2, DRAM_CTRL, dword);
1579
1580                 dword = pci_read_config32(ctrl[i].f2, DRAM_INIT);
1581                 dword |= DI_EnDramInit;
1582                 pci_write_config32(ctrl[i].f2, DRAM_INIT, dword);
1583                 dword &= ~DI_EnDramInit;
1584                 pci_write_config32(ctrl[i].f2, DRAM_INIT, dword);
1585
1586                 tsc1[i] = rdtsc();
1587                 print_debug_dqs_tsc("begin: tsc1", i, tsc1[i].hi, tsc1[i].lo, 2);
1588
1589                 dword = tsc1[i].lo + tsc0[i].lo;
1590                 if((dword<tsc1[i].lo) || (dword<tsc0[i].lo)) {
1591                         tsc1[i].hi++;
1592                 }
1593                 tsc1[i].lo = dword;
1594                 tsc1[i].hi+= tsc0[i].hi;
1595
1596                 print_debug_dqs_tsc("end  : tsc1", i, tsc1[i].hi, tsc1[i].lo, 2);
1597
1598         }
1599
1600         for(i = 0; i < controllers; i++) {
1601                 if (!sysinfo->ctrl_present[i])
1602                         continue;
1603
1604                 /* Skip everything if I don't have any memory on this controller */
1605                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1606
1607                 if(!cpu_f0_f1[i]) continue;
1608
1609                 tsc_t tsc;
1610
1611                 do {
1612                         tsc = rdtsc();
1613                 } while ((tsc1[i].hi>tsc.hi) || ((tsc1[i].hi==tsc.hi) && (tsc1[i].lo>tsc.lo)));
1614
1615                 print_debug_dqs_tsc("end  : tsc ", i, tsc.hi, tsc.lo, 2);
1616         }
1617
1618 }
1619
1620 #endif
1621
1622
1623 /* setting variable mtrr, comes from linux kernel source */
1624 static void set_var_mtrr_dqs(
1625         unsigned int reg, unsigned long basek, unsigned long sizek,
1626         unsigned char type, unsigned address_bits)
1627 {
1628         msr_t base, mask;
1629         unsigned address_mask_high;
1630
1631         address_mask_high = ((1u << (address_bits - 32u)) - 1u);
1632
1633         base.hi = basek >> 22;
1634         base.lo  = basek << 10;
1635
1636         if (sizek < 4*1024*1024) {
1637                 mask.hi = address_mask_high;
1638                 mask.lo = ~((sizek << 10) -1);
1639         }
1640         else {
1641                 mask.hi = address_mask_high & (~((sizek >> 22) -1));
1642                 mask.lo = 0;
1643         }
1644
1645         if (reg >= 8)
1646                 return;
1647
1648         if (sizek == 0) {
1649                 msr_t zero;
1650                 zero.lo = zero.hi = 0;
1651                 /* The invalid bit is kept in the mask, so we simply clear the
1652                    relevant mask register to disable a range. */
1653                 wrmsr (MTRRphysMask_MSR(reg), zero);
1654         } else {
1655                 /* Bit 32-35 of MTRRphysMask should be set to 1 */
1656                 base.lo |= type;
1657                 mask.lo |= 0x800;
1658                 wrmsr (MTRRphysBase_MSR(reg), base);
1659                 wrmsr (MTRRphysMask_MSR(reg), mask);
1660         }
1661 }
1662
1663
/*
 * fms: find most significant bit set.
 *
 * Returns the zero-based index of the highest set bit of x, or 0 when
 * x is 0 (so x == 0 and x == 1 both yield 0).
 *
 * Uses the compiler builtin instead of hand-written "bsrl": the former
 * inline asm used a "g" input constraint, which permits an immediate
 * operand that the instruction cannot encode.
 */
static inline unsigned int fms(unsigned int x)
{
        if (x == 0)
                return 0;       /* __builtin_clz(0) is undefined */

        return (unsigned int)(sizeof(x) * 8 - 1) - (unsigned int)__builtin_clz(x);
}
1675
/*
 * fls: find least significant bit set.
 *
 * Returns the zero-based index of the lowest set bit of x, or 32 when
 * x is 0.  (Note: unlike the Linux helper of the same name, this is
 * "find LEAST significant set".)
 *
 * Uses the compiler builtin instead of hand-written "bsfl": the former
 * inline asm used a "g" input constraint, which permits an immediate
 * operand that the instruction cannot encode.
 */
static inline unsigned int fls(unsigned int x)
{
        if (x == 0)
                return 32;      /* __builtin_ctz(0) is undefined */

        return (unsigned int)__builtin_ctz(x);
}
1687
1688 static unsigned int range_to_mtrr(unsigned int reg,
1689         unsigned long range_startk, unsigned long range_sizek,
1690         unsigned long next_range_startk, unsigned char type, unsigned address_bits)
1691 {
1692         if (!range_sizek || (reg >= 8)) {
1693                 return reg;
1694         }
1695         while(range_sizek) {
1696                 unsigned long max_align, align;
1697                 unsigned long sizek;
1698                 /* Compute the maximum size I can make a range */
1699                 max_align = fls(range_startk);
1700                 align = fms(range_sizek);
1701                 if (align > max_align) {
1702                         align = max_align;
1703                 }
1704                 sizek = 1 << align;
1705 #if MEM_TRAIN_SEQ != 1
1706                 printk_debug("Setting variable MTRR %d, base: %4dMB, range: %4dMB, type %s\r\n",
1707                         reg, range_startk >>10, sizek >> 10,
1708                         (type==MTRR_TYPE_UNCACHEABLE)?"UC":
1709                             ((type==MTRR_TYPE_WRBACK)?"WB":"Other")
1710                         );
1711 #endif
1712                 set_var_mtrr_dqs(reg++, range_startk, sizek, type, address_bits);
1713                 range_startk += sizek;
1714                 range_sizek -= sizek;
1715                 if (reg >= 8)
1716                         break;
1717         }
1718         return reg;
1719 }
1720
1721 static void set_top_mem_ap(unsigned tom_k, unsigned tom2_k)
1722 {
1723         msr_t msr;
1724
1725         /* Now set top of memory */
1726         msr.lo = (tom2_k & 0x003fffff) << 10;
1727         msr.hi = (tom2_k & 0xffc00000) >> 22;
1728         wrmsr(TOP_MEM2, msr);
1729
1730         msr.lo = (tom_k & 0x003fffff) << 10;
1731         msr.hi = (tom_k & 0xffc00000) >> 22;
1732         wrmsr(TOP_MEM, msr);
1733 }
1734
1735 static void setup_mtrr_dqs(unsigned tom_k, unsigned tom2_k){
1736         unsigned reg;
1737         msr_t msr;
1738
1739 #if 0
1740         //still enable from cache_as_ram.inc
1741         msr = rdmsr(SYSCFG_MSR);
1742         msr.lo |= SYSCFG_MSR_MtrrFixDramModEn;
1743         wrmsr(SYSCFG_MSR,msr);
1744 #endif
1745
1746         //[0,512k), [512k, 640k)
1747         msr.hi = 0x1e1e1e1e;
1748         msr.lo = msr.hi;
1749         wrmsr(0x250, msr);
1750         wrmsr(0x258, msr);
1751
1752         //[1M, TOM)
1753         reg = range_to_mtrr(2, 0, tom_k,4*1024*1024, MTRR_TYPE_WRBACK, 40);
1754
1755         //[4G, TOM2)
1756         if(tom2_k) {
1757                 //enable tom2 and type
1758                 msr = rdmsr(SYSCFG_MSR);
1759                 msr.lo |= (1<<21) | (1<<22); //MtrrTom2En and Tom2ForceMemTypeWB
1760                 wrmsr(SYSCFG_MSR, msr);
1761         }
1762
1763 }
1764
1765 static void clear_mtrr_dqs(unsigned tom2_k){
1766         msr_t msr;
1767         unsigned i;
1768
1769         //still enable from cache_as_ram.inc
1770         msr = rdmsr(SYSCFG_MSR);
1771         msr.lo |= SYSCFG_MSR_MtrrFixDramModEn;
1772         wrmsr(SYSCFG_MSR,msr);
1773
1774         //[0,512k), [512k, 640k)
1775         msr.hi = 0;
1776         msr.lo = msr.hi;
1777         wrmsr(0x250, msr);
1778         wrmsr(0x258, msr);
1779
1780         //[1M, TOM)
1781         for(i=0x204;i<0x210;i++) {
1782                 wrmsr(i, msr);
1783         }
1784
1785         //[4G, TOM2)
1786         if(tom2_k) {
1787                 //enable tom2 and type
1788                 msr = rdmsr(SYSCFG_MSR);
1789                 msr.lo &= ~((1<<21) | (1<<22)); //MtrrTom2En and Tom2ForceMemTypeWB
1790                 wrmsr(SYSCFG_MSR, msr);
1791         }
1792 }
1793
1794 static void set_htic_bit(unsigned i, unsigned val, unsigned bit)
1795 {
1796         uint32_t dword;
1797         dword = pci_read_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL);
1798         dword &= ~(1<<bit);
1799         dword |= ((val & 1) <<bit);
1800         pci_write_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL, dword);
1801 }
1802
1803
1804 static unsigned get_htic_bit(unsigned i, unsigned bit)
1805 {
1806         uint32_t dword;
1807         dword = pci_read_config32(PCI_DEV(0, 0x18+i, 0), HT_INIT_CONTROL);
1808         dword &= (1<<bit);
1809         return dword;
1810 }
1811
/*
 * Busy-wait until bit 9 of node 0's HT_INIT_CONTROL register reads as
 * set (the flag written by set_sysinfo_in_ram()).  There is no timeout.
 */
static void wait_till_sysinfo_in_ram(void)
{
        while (!get_htic_bit(0, 9)) {
                /* keep polling */
        }
}
1818
/*
 * Store the low bit of `val` in bit 9 of node 0's HT_INIT_CONTROL
 * register; wait_till_sysinfo_in_ram() polls that same bit.
 */
static void set_sysinfo_in_ram(unsigned val)
{
        const unsigned node = 0;
        const unsigned flag_bit = 9;

        set_htic_bit(node, val, flag_bit);
}
1823
1824
1825 #if MEM_TRAIN_SEQ == 0
1826
1827
1828 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1829 static void dqs_timing(int controllers, const struct mem_controller *ctrl, tsc_t *tsc0, struct sys_info *sysinfo)
1830 #else
1831 static void dqs_timing(int controllers, const struct mem_controller *ctrl, struct sys_info *sysinfo)
1832 #endif
1833 {
1834         int  i;
1835
1836         tsc_t tsc[5];
1837
1838         //need to enable mtrr, so dqs training could access the test address
1839         setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
1840
1841         for(i = 0; i < controllers; i++) {
1842                 if (!sysinfo->ctrl_present[ i ])
1843                         continue;
1844
1845                 /* Skip everything if I don't have any memory on this controller */
1846                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1847
1848                 fill_mem_cs_sysinfo(i, ctrl+i, sysinfo);
1849         }
1850
1851         tsc[0] = rdtsc();
1852         for(i = 0; i < controllers; i++) {
1853                 if (!sysinfo->ctrl_present[ i ])
1854                         continue;
1855
1856                 /* Skip everything if I don't have any memory on this controller */
1857                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1858
1859                 printk_debug("DQS Training:RcvrEn:Pass1: %02x\n", i);
1860                 if(train_DqsRcvrEn(ctrl+i, 1, sysinfo)) goto out;
1861                 printk_debug(" done\r\n");
1862         }
1863
1864         tsc[1] = rdtsc();
1865 #if K8_REV_F_SUPPORT_F0_F1_WORKAROUND == 1
1866         f0_svm_workaround(controllers, ctrl, tsc0, sysinfo);
1867 #endif
1868
1869         tsc[2] = rdtsc();
1870         for(i = 0; i < controllers; i++) {
1871                 if (!sysinfo->ctrl_present[i])
1872                         continue;
1873
1874                 /* Skip everything if I don't have any memory on this controller */
1875                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1876
1877                 printk_debug("DQS Training:DQSPos: %02x\n", i);
1878                 if(train_DqsPos(ctrl+i, sysinfo)) goto out;
1879                 printk_debug(" done\r\n");
1880         }
1881
1882         tsc[3] = rdtsc();
1883         for(i = 0; i < controllers; i++) {
1884                 if (!sysinfo->ctrl_present[i])
1885                         continue;
1886
1887                 /* Skip everything if I don't have any memory on this controller */
1888                 if(sysinfo->meminfo[i].dimm_mask==0x00) continue;
1889
1890                 printk_debug("DQS Training:RcvrEn:Pass2: %02x\n", i);
1891                 if(train_DqsRcvrEn(ctrl+i, 2, sysinfo)) goto out;
1892                 printk_debug(" done\r\n");
1893                 sysinfo->mem_trained[i]=1;
1894         }
1895
1896 out:
1897         tsc[4] = rdtsc();
1898         clear_mtrr_dqs(sysinfo->tom2_k);
1899
1900
1901         for(i=0;i<5;i++) {
1902                 print_debug_dqs_tsc_x("DQS Training:tsc", i,  tsc[i].hi, tsc[i].lo);
1903         }
1904
1905
1906
1907 }
1908
1909 #endif
1910
1911
1912 #if MEM_TRAIN_SEQ > 0
1913
1914 static void dqs_timing(int i, const struct mem_controller *ctrl, struct sys_info *sysinfo, unsigned v)
1915 {
1916
1917         int ii;
1918
1919          tsc_t tsc[4];
1920
1921         if(sysinfo->mem_trained[i] != 0x80) return;
1922
1923 #if MEM_TRAIN_SEQ == 1
1924         //need to enable mtrr, so dqs training could access the test address
1925         setup_mtrr_dqs(sysinfo->tom_k, sysinfo->tom2_k);
1926 #endif
1927
1928         fill_mem_cs_sysinfo(i, ctrl, sysinfo);
1929
1930         if(v) {
1931                 tsc[0] = rdtsc();
1932
1933                 printk_debug("set DQS timing:RcvrEn:Pass1: %02x\n", i);
1934         }
1935         if(train_DqsRcvrEn(ctrl, 1,  sysinfo)) {
1936                 sysinfo->mem_trained[i]=0x81; //
1937                 goto out;
1938         }
1939
1940         if(v) {
1941                 printk_debug(" done\r\n");
1942                 tsc[1] = rdtsc();
1943                 printk_debug("set DQS timing:DQSPos: %02x\n", i);
1944         }
1945
1946         if(train_DqsPos(ctrl, sysinfo)) {
1947                 sysinfo->mem_trained[i]=0x82; //
1948                 goto out;
1949         }
1950
1951         if(v) {
1952                 printk_debug(" done\r\n");
1953                 tsc[2] = rdtsc();
1954
1955                 printk_debug("set DQS timing:RcvrEn:Pass2: %02x\n", i);
1956         }
1957         if(train_DqsRcvrEn(ctrl, 2,  sysinfo)){
1958                 sysinfo->mem_trained[i]=0x83; //
1959                 goto out;
1960         }
1961
1962         if(v) {
1963                 printk_debug(" done\r\n");
1964
1965                 tsc[3] = rdtsc();
1966         }
1967
1968 out:
1969 #if MEM_TRAIN_SEQ == 1
1970         clear_mtrr_dqs(sysinfo->tom2_k);
1971 #endif
1972
1973         if(v) {
1974                 for(ii=0;ii<4;ii++) {
1975                       print_debug_dqs_tsc_x("Total DQS Training : tsc ", ii,  tsc[ii].hi, tsc[ii].lo);
1976                 }
1977         }
1978
1979         if(sysinfo->mem_trained[i] == 0x80) {
1980                 sysinfo->mem_trained[i]=1;
1981         }
1982
1983 }
1984 #endif
1985
1986 #if MEM_TRAIN_SEQ == 1
1987 static void train_ram(unsigned nodeid, struct sys_info *sysinfo, struct sys_info *sysinfox)
1988 {
1989         dqs_timing(nodeid, &sysinfo->ctrl[nodeid], sysinfo, 0); // keep the output tidy
1990 //      memcpy(&sysinfox->dqs_rcvr_dly_a[nodeid * 2 * 8],&sysinfo->dqs_rcvr_dly_a[nodeid * 2 * 8], 2*8);
1991 //      memcpy(&sysinfox->dqs_delay_a[nodeid * 2 * 2 * 9], &sysinfo->dqs_delay_a[nodeid * 2 * 2 * 9], 2 * 2 * 9);
1992         sysinfox->mem_trained[nodeid] = sysinfo->mem_trained[nodeid];
1993
1994 }
1995 static void copy_and_run_ap_code_in_car(unsigned ret_addr);
1996 static inline void train_ram_on_node(unsigned nodeid, unsigned coreid, struct sys_info *sysinfo, unsigned retcall)
1997 {
1998         if(coreid) return; // only do it on core0
1999         struct sys_info *sysinfox = ((CONFIG_LB_MEM_TOPK<<10) - DCACHE_RAM_GLOBAL_VAR_SIZE);
2000         wait_till_sysinfo_in_ram(); // use pci to get it
2001
2002         if(sysinfox->mem_trained[nodeid] == 0x80) {
2003         #if 0
2004                 sysinfo->tom_k = sysinfox->tom_k;
2005                 sysinfo->tom2_k = sysinfox->tom2_k;
2006                 sysinfo->meminfo[nodeid].is_Width128 = sysinfox->meminfo[nodeid].is_Width128;
2007                 sysinfo->mem_trained[nodeid] = sysinfox->mem_trained[nodeid];
2008                 memcpy(&sysinfo->ctrl[nodeid], &sysinfox->ctrl[nodeid], sizeof(struct mem_controller));
2009         #else
2010                 memcpy(sysinfo, sysinfox, DCACHE_RAM_GLOBAL_VAR_SIZE);
2011         #endif
2012                 set_top_mem_ap(sysinfo->tom_k, sysinfo->tom2_k); // keep the ap's tom consistent with bsp's
2013         #if CONFIG_AP_CODE_IN_CAR == 0
2014                 printk_debug("CODE IN ROM AND RUN ON NODE: %02x\n", nodeid);
2015                 train_ram(nodeid, sysinfo, sysinfox);
2016         #else
2017                 /* Can copy dqs_timing to ap cache and run from cache?
2018                 * we need coreboot_ap_car.rom? and treat it as coreboot_ram.rom for ap ?
2019                 */
2020                 copy_and_run_ap_code_in_car(retcall);
2021                 // will go back by jump
2022         #endif
2023         }
2024 }
2025 #endif