GNU Linux-libre 5.10.217-gnu1
drivers/thermal/intel/intel_powerclamp.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *      TODO:
 *           1. Better handle wakeups from external interrupts. Currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. The
 *              reason is that, for external interrupts which need no ack,
 *              clamping down a CPU in non-irq context does not reduce the irq
 *              rate. In the majority of cases clamping down a CPU does help
 *              reduce irqs as well; we should be able to differentiate the
 *              two cases and give a quantitative solution for the irqs we
 *              can control, perhaps based on get_cpu_iowait_time_us().
 *
 *           2. Synchronization with other HW blocks.
 */

#define pr_fmt(fmt)     KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;
static bool poll_pkg_cstate_enable;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The CPU assigned to collect stats and
                                  * update control parameters; defaults to
                                  * the BSP, but the BSP can be offlined.
                                  */
static bool clamping;

struct powerclamp_worker_data {
        struct kthread_worker *worker;
        struct kthread_work balancing_work;
        struct kthread_delayed_work idle_injection_work;
        unsigned int cpu;
        unsigned int count;
        unsigned int guard;
        unsigned int window_size_now;
        unsigned int target_ratio;
        unsigned int duration_jiffies;
        bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bitmap tracking which CPUs have
                                           * a clamping kthread worker
                                           */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_duration;

        ret = kstrtoul(arg, 10, &new_duration);
        if (ret)
                goto exit;
        if (new_duration > 25 || new_duration < 6) {
                pr_err("Out of recommended range %lu, between 6-25ms\n",
                        new_duration);
                ret = -EINVAL;
        }

        /* the clamped value is applied even when -EINVAL is returned */
        duration = clamp(new_duration, 6ul, 25ul);
        smp_mb();

exit:
        return ret;
}

static const struct kernel_param_ops duration_ops = {
        .set = duration_set,
        .get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

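/*
 * Usage sketch (the sysfs path follows from module_param_cb() above; the
 * value is illustrative):
 *
 *   # request 10 ms of forced idle per injection attempt
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 *
 * Values outside 6..25 ms are rejected with -EINVAL, although duration_set()
 * still applies the clamped value.
 */
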
struct powerclamp_calibration_data {
        unsigned long confidence;  /* used for calibration: a counter that is
                                    * incremented each time a clamping period
                                    * completes without extra wakeups; once it
                                    * reaches a given level, the compensation
                                    * is deemed usable.
                                    */
        unsigned long steady_comp; /* steady state compensation used when
                                    * no extra wakeups occurred.
                                    */
        unsigned long dynamic_comp; /* compensates for excessive wakeups from
                                     * idle, mostly from external interrupts.
                                     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_window_size;

        ret = kstrtoul(arg, 10, &new_window_size);
        if (ret)
                goto exit_win;
        if (new_window_size > 10 || new_window_size < 2) {
                pr_err("Out of recommended window size %lu, between 2-10\n",
                        new_window_size);
                ret = -EINVAL;
        }

        window_size = clamp(new_window_size, 2ul, 10ul);
        smp_mb();

exit_win:
        return ret;
}

static const struct kernel_param_ops window_size_ops = {
        .set = window_size_set,
        .get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
        "\tpowerclamp controls the idle ratio within this window. A larger\n"
        "\twindow size results in slower response time but smoother\n"
        "\tclamping results. Defaults to 2.");

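/*
 * Worked example (illustrative numbers): statistics are evaluated once per
 * window_size injection cycles. The guard band computed in
 * clamp_balancing_func() is 1 + target_ratio / 20, so for a 40% target the
 * guard is 3 and injection is skipped for a window once the measured package
 * idle ratio reaches 43% (see powerclamp_adjust_controls()).
 */
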
static void find_target_mwait(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int highest_cstate = 0;
        unsigned int highest_subcstate = 0;
        int i;

        if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
                return;

        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
            !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
                return;

        edx >>= MWAIT_SUBSTATE_SIZE;
        for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
                if (edx & MWAIT_SUBSTATE_MASK) {
                        highest_cstate = i;
                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
                }
        }
        target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
                (highest_subcstate - 1);
}

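/*
 * Hint encoding sketch for find_target_mwait() (illustrative numbers):
 * CPUID leaf 5 EDX holds one MWAIT_SUBSTATE_SIZE-bit (4-bit) sub-state
 * count per C-state. If the deepest populated field is i = 5 with two
 * sub-states, then highest_cstate = 5, highest_subcstate = 2, and
 * target_mwait = (5 << 4) | (2 - 1) = 0x51.
 */
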
struct pkg_cstate_info {
        bool skip;
        int msr_index;
        int cstate_id;
};

#define PKG_CSTATE_INIT(id) {                           \
                .msr_index = MSR_PKG_C##id##_RESIDENCY, \
                .cstate_id = id                         \
                        }

static struct pkg_cstate_info pkg_cstates[] = {
        PKG_CSTATE_INIT(2),
        PKG_CSTATE_INIT(3),
        PKG_CSTATE_INIT(6),
        PKG_CSTATE_INIT(7),
        PKG_CSTATE_INIT(8),
        PKG_CSTATE_INIT(9),
        PKG_CSTATE_INIT(10),
        {NULL},
};

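/*
 * For reference, PKG_CSTATE_INIT(6) expands to
 * { .msr_index = MSR_PKG_C6_RESIDENCY, .cstate_id = 6 }; the zero-filled
 * terminating entry ends the msr_index-driven walks below.
 */
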
static bool has_pkg_state_counter(void)
{
        u64 val;
        struct pkg_cstate_info *info = pkg_cstates;

        /* check if any one of the counter MSRs exists */
        while (info->msr_index) {
                if (!rdmsrl_safe(info->msr_index, &val))
                        return true;
                info++;
        }

        return false;
}

static u64 pkg_state_counter(void)
{
        u64 val;
        u64 count = 0;
        struct pkg_cstate_info *info = pkg_cstates;

        while (info->msr_index) {
                if (!info->skip) {
                        if (!rdmsrl_safe(info->msr_index, &val))
                                count += val;
                        else
                                info->skip = true;
                }
                info++;
        }

        return count;
}

static unsigned int get_compensation(int ratio)
{
        unsigned int comp = 0;

        if (!poll_pkg_cstate_enable)
                return 0;

        /* we only use compensation if all adjacent ones are good */
        if (ratio == 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio + 1].steady_comp +
                        cal_data[ratio + 2].steady_comp) / 3;
        } else if (ratio == MAX_TARGET_RATIO - 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio - 2].steady_comp) / 3;
        } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio + 1].steady_comp) / 3;
        }

        /* REVISIT: simple penalty of double idle injection */
        if (reduce_irq)
                comp = ratio;
        /* do not exceed limit */
        if (comp + ratio >= MAX_TARGET_RATIO)
                comp = MAX_TARGET_RATIO - ratio - 1;

        return comp;
}

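/*
 * Worked example for get_compensation() (illustrative numbers): for a
 * target ratio of 30, if the calibrated steady_comp values for ratios
 * 29, 30 and 31 are 2, 3 and 4 and all three have reached CONFIDENCE_OK,
 * then comp = (2 + 3 + 4) / 3 = 3, so the driver injects 33% idle to get
 * a measured 30% of package idle time.
 */
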
static void adjust_compensation(int target_ratio, unsigned int win)
{
        int delta;
        struct powerclamp_calibration_data *d = &cal_data[target_ratio];

        /*
         * Skip the adjustment if the confidence level has already been
         * reached, or if there were too many wakeups during the last idle
         * injection period, in which case the data cannot be trusted for
         * compensation.
         */
        if (d->confidence >= CONFIDENCE_OK ||
                atomic_read(&idle_wakeup_counter) >
                win * num_online_cpus())
                return;

        delta = set_target_ratio - current_ratio;
        /* filter out bad data */
        if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
                if (d->steady_comp)
                        d->steady_comp =
                                roundup(delta + d->steady_comp, 2) / 2;
                else
                        d->steady_comp = delta;
                d->confidence++;
        }
}

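/*
 * The steady_comp update above is a rounded-up two-point running average
 * of the previous compensation and the new delta, e.g. with
 * steady_comp = 3 and delta = 4: roundup(4 + 3, 2) / 2 = 8 / 2 = 4.
 */
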
static bool powerclamp_adjust_controls(unsigned int target_ratio,
                                unsigned int guard, unsigned int win)
{
        static u64 msr_last, tsc_last;
        u64 msr_now, tsc_now;
        u64 val64;

        /* check result for the last window */
        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                current_ratio = 1;
        else if (tsc_now - tsc_last) {
                val64 = 100 * (msr_now - msr_last);
                do_div(val64, (tsc_now - tsc_last));
                current_ratio = val64;
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        adjust_compensation(target_ratio, win);
        /*
         * too many external interrupts; set the flag so that we can take
         * measures later.
         */
        reduce_irq = atomic_read(&idle_wakeup_counter) >=
                2 * win * num_online_cpus();

        atomic_set(&idle_wakeup_counter, 0);
        /* if we are above target+guard, skip */
        return set_target_ratio + guard <= current_ratio;
}

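/*
 * Ratio math used above, with illustrative numbers: if the summed package
 * C-state residency counters advanced by 40M cycles while the TSC advanced
 * by 100M cycles over the last window, then
 * current_ratio = 100 * 40M / 100M = 40, i.e. 40% package idle time.
 */
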
static void clamp_balancing_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;
        int sleeptime;
        unsigned long target_jiffies;
        unsigned int compensated_ratio;
        int interval; /* jiffies to sleep for each attempt */

        w_data = container_of(work, struct powerclamp_worker_data,
                              balancing_work);

        /*
         * Make sure the user-selected ratio does not take effect until
         * the next round. Adjust target_ratio if the user has changed the
         * target, so that we can converge quickly.
         */
        w_data->target_ratio = READ_ONCE(set_target_ratio);
        w_data->guard = 1 + w_data->target_ratio / 20;
        w_data->window_size_now = window_size;
        w_data->duration_jiffies = msecs_to_jiffies(duration);
        w_data->count++;

        /*
         * Systems differ in their ability to enter package level c-states,
         * thus we need to compensate the injected idle ratio to achieve the
         * actual target reported by the HW.
         */
        compensated_ratio = w_data->target_ratio +
                get_compensation(w_data->target_ratio);
        if (compensated_ratio <= 0)
                compensated_ratio = 1;
        interval = w_data->duration_jiffies * 100 / compensated_ratio;

        /* align idle time */
        target_jiffies = roundup(jiffies, interval);
        sleeptime = target_jiffies - jiffies;
        if (sleeptime <= 0)
                sleeptime = 1;

        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_delayed_work(w_data->worker,
                                           &w_data->idle_injection_work,
                                           sleeptime);
}

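/*
 * Interval arithmetic above, with illustrative numbers: with
 * duration_jiffies = 6 and compensated_ratio = 25,
 * interval = 6 * 100 / 25 = 24, so 6 jiffies of injected idle out of every
 * 24-jiffy period yields the requested 25% idle. Rounding target_jiffies up
 * to a multiple of the interval keeps all per-CPU workers injecting idle at
 * the same aligned points in time, which is what allows the whole package
 * to enter a package C-state.
 */
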
static void clamp_idle_injection_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;

        w_data = container_of(work, struct powerclamp_worker_data,
                              idle_injection_work.work);

        /*
         * Only the elected controlling CPU can collect stats and update
         * control parameters.
         */
        if (w_data->cpu == control_cpu &&
            !(w_data->count % w_data->window_size_now)) {
                should_skip =
                        powerclamp_adjust_controls(w_data->target_ratio,
                                                   w_data->guard,
                                                   w_data->window_size_now);
                smp_mb();
        }

        if (should_skip)
                goto balance;

        play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 Hz polling while clamping is active, useful for userspace
 * to monitor the actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
        static u64 msr_last;
        static u64 tsc_last;

        u64 msr_now;
        u64 tsc_now;
        u64 val64;

        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                pkg_cstate_ratio_cur = 1;
        else {
                if (tsc_now - tsc_last) {
                        val64 = 100 * (msr_now - msr_last);
                        do_div(val64, (tsc_now - tsc_last));
                        pkg_cstate_ratio_cur = val64;
                }
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        if (clamping)
                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static void start_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
        struct kthread_worker *worker;

        worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
        if (IS_ERR(worker))
                return;

        w_data->worker = worker;
        w_data->count = 0;
        w_data->cpu = cpu;
        w_data->clamping = true;
        set_bit(cpu, cpu_clamping_mask);
        sched_set_fifo(worker->task);
        kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
        kthread_init_delayed_work(&w_data->idle_injection_work,
                                  clamp_idle_injection_func);
        kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

        if (!w_data->worker)
                return;

        w_data->clamping = false;
        /*
         * Make sure that all works that get queued after this point see
         * the clamping disabled. The counterpart is not needed because
         * there is an implicit memory barrier when the queued work
         * is processed.
         */
        smp_wmb();
        kthread_cancel_work_sync(&w_data->balancing_work);
        kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
        /*
         * The balancing work still might be queued here because
         * the handling of the "clamping" variable, cancel, and queue
         * operations are not synchronized via a lock. But it is not
         * a big deal. The balancing work is fast and
         * kthread_destroy_worker() will wait for it.
         */
        clear_bit(w_data->cpu, cpu_clamping_mask);
        kthread_destroy_worker(w_data->worker);

        w_data->worker = NULL;
}

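/*
 * Design note: each online CPU runs one FIFO-priority kthread worker that
 * ping-pongs between two work items: balancing_work recomputes the injection
 * interval and schedules idle_injection_work, which calls play_idle() and
 * then requeues balancing_work. Clearing the global and per-CPU "clamping"
 * flags breaks this requeue cycle and lets the workers drain.
 */
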
static int start_power_clamp(void)
{
        unsigned long cpu;

        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
        /* prevent cpu hotplug */
        get_online_cpus();

        /* prefer BSP */
        control_cpu = cpumask_first(cpu_online_mask);

        clamping = true;
        if (poll_pkg_cstate_enable)
                schedule_delayed_work(&poll_pkg_cstate_work, 0);

        /* start one kthread worker per online cpu */
        for_each_online_cpu(cpu) {
                start_power_clamp_worker(cpu);
        }
        put_online_cpus();

        return 0;
}

static void end_power_clamp(void)
{
        int i;

        /*
         * Block requeuing in all the kthread workers. They will flush and
         * stop faster.
         */
        clamping = false;
        if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
                for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
                        pr_debug("clamping worker for cpu %d alive, destroy\n",
                                 i);
                        stop_power_clamp_worker(i);
                }
        }
}

static int powerclamp_cpu_online(unsigned int cpu)
{
        if (!clamping)
                return 0;
        start_power_clamp_worker(cpu);
        /* prefer BSP as controlling CPU */
        if (cpu == 0) {
                control_cpu = 0;
                smp_mb();
        }
        return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
        if (!clamping)
                return 0;

        stop_power_clamp_worker(cpu);
        if (cpu != control_cpu)
                return 0;

        control_cpu = cpumask_first(cpu_online_mask);
        if (control_cpu == cpu)
                control_cpu = cpumask_next(cpu, cpu_online_mask);
        smp_mb();
        return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        *state = MAX_TARGET_RATIO;

        return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        if (clamping) {
                if (poll_pkg_cstate_enable)
                        *state = pkg_cstate_ratio_cur;
                else
                        *state = set_target_ratio;
        } else {
                /* to save power, do not poll idle ratio while not clamping */
                *state = -1; /* indicates invalid state */
        }

        return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long new_target_ratio)
{
        int ret = 0;

        new_target_ratio = clamp(new_target_ratio, 0UL,
                                (unsigned long) (MAX_TARGET_RATIO - 1));
        if (set_target_ratio == 0 && new_target_ratio > 0) {
                pr_info("Start idle injection to reduce power\n");
                set_target_ratio = new_target_ratio;
                ret = start_power_clamp();
                goto exit_set;
        } else if (set_target_ratio > 0 && new_target_ratio == 0) {
                pr_info("Stop forced idle injection\n");
                end_power_clamp();
                set_target_ratio = 0;
        } else { /* adjust currently running */
                set_target_ratio = new_target_ratio;
                /* make new set_target_ratio visible to other cpus */
                smp_mb();
        }

exit_set:
        return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
        .get_max_state = powerclamp_get_max_state,
        .get_cur_state = powerclamp_get_cur_state,
        .set_cur_state = powerclamp_set_cur_state,
};

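/*
 * Usage sketch for the cooling device interface (the instance number N
 * varies per system; the value is illustrative):
 *
 *   cat /sys/class/thermal/cooling_deviceN/type  # reads "intel_powerclamp"
 *   # request ~25% forced idle
 *   echo 25 > /sys/class/thermal/cooling_deviceN/cur_state
 *   # stop idle injection
 *   echo 0 > /sys/class/thermal/cooling_deviceN/cur_state
 */
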
static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
        X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
        if (!x86_match_cpu(intel_powerclamp_ids)) {
                pr_err("CPU does not support MWAIT\n");
                return -ENODEV;
        }

        /*
         * Idle time is aligned across CPUs with the goal of reaching a
         * package C-state, so the package residency counters must exist.
         */
        if (!has_pkg_state_counter()) {
                pr_info("No package C-state available\n");
                return -ENODEV;
        }

        /* find the deepest mwait value */
        find_target_mwait();

        return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
        int i = 0;

        seq_printf(m, "controlling cpu: %d\n", control_cpu);
        seq_puts(m, "pct confidence steady dynamic (compensation)\n");
        for (i = 0; i < MAX_TARGET_RATIO; i++) {
                seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
                        i,
                        cal_data[i].confidence,
                        cal_data[i].steady_comp,
                        cal_data[i].dynamic_comp);
        }

        return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
        debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

        debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
                            &powerclamp_debug_fops);
}

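/*
 * Usage sketch (assumes debugfs is mounted at its usual location):
 *
 *   cat /sys/kernel/debug/intel_powerclamp/powerclamp_calib
 *
 * prints the controlling CPU followed by one calibration row per target
 * ratio percentage.
 */
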
static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
        int retval;
        int bitmap_size;

        bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
        cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
        if (!cpu_clamping_mask)
                return -ENOMEM;

        /* probe cpu features and ids here */
        retval = powerclamp_probe();
        if (retval)
                goto exit_free;

        /* set default limit, may be adjusted during runtime based on feedback */
        window_size = 2;
        retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                           "thermal/intel_powerclamp:online",
                                           powerclamp_cpu_online,
                                           powerclamp_cpu_predown);
        if (retval < 0)
                goto exit_free;

        hp_state = retval;

        worker_data = alloc_percpu(struct powerclamp_worker_data);
        if (!worker_data) {
                retval = -ENOMEM;
                goto exit_unregister;
        }

        /*
         * The package C-state residency MSRs are package-scoped, so the
         * 1 Hz poll is only representative on single-package, single-die
         * systems.
         */
        if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
                poll_pkg_cstate_enable = true;

        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
                                                &powerclamp_cooling_ops);
        if (IS_ERR(cooling_dev)) {
                retval = -ENODEV;
                goto exit_free_thread;
        }

        if (!duration)
                duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

        powerclamp_create_debug_files();

        return 0;

exit_free_thread:
        free_percpu(worker_data);
exit_unregister:
        cpuhp_remove_state_nocalls(hp_state);
exit_free:
        kfree(cpu_clamping_mask);
        return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
        end_power_clamp();
        cpuhp_remove_state_nocalls(hp_state);
        free_percpu(worker_data);
        thermal_cooling_device_unregister(cooling_dev);
        kfree(cpu_clamping_mask);

        cancel_delayed_work_sync(&poll_pkg_cstate_work);
        debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");