diff options
Diffstat (limited to 'drivers/misc/habanalabs/common/habanalabs.h')
-rw-r--r-- | drivers/misc/habanalabs/common/habanalabs.h | 109 |
1 files changed, 98 insertions, 11 deletions
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 41af347157e0..d933878b24d1 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -28,17 +28,18 @@ #define HL_NAME "habanalabs" /* Use upper bits of mmap offset to store habana driver specific information. - * bits[63:62] - Encode mmap type + * bits[63:61] - Encode mmap type * bits[45:0] - mmap offset value * * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these * defines are w.r.t to PAGE_SIZE */ -#define HL_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT) -#define HL_MMAP_TYPE_MASK (0x3ull << HL_MMAP_TYPE_SHIFT) +#define HL_MMAP_TYPE_SHIFT (61 - PAGE_SHIFT) +#define HL_MMAP_TYPE_MASK (0x7ull << HL_MMAP_TYPE_SHIFT) +#define HL_MMAP_TYPE_BLOCK (0x4ull << HL_MMAP_TYPE_SHIFT) #define HL_MMAP_TYPE_CB (0x2ull << HL_MMAP_TYPE_SHIFT) -#define HL_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFull >> PAGE_SHIFT) +#define HL_MMAP_OFFSET_VALUE_MASK (0x1FFFFFFFFFFFull >> PAGE_SHIFT) #define HL_MMAP_OFFSET_VALUE_GET(off) (off & HL_MMAP_OFFSET_VALUE_MASK) #define HL_PENDING_RESET_PER_SEC 10 @@ -408,6 +409,9 @@ struct hl_mmu_properties { * @sync_stream_first_mon: first monitor available for sync stream use * @first_available_user_sob: first sob available for the user * @first_available_user_mon: first monitor available for the user + * @first_available_user_msix_interrupt: first available msix interrupt + * reserved for the user + * @first_available_cq: first available CQ for the user. * @tpc_enabled_mask: which TPCs are enabled. * @completion_queues_count: number of completion queues. * @fw_security_disabled: true if security measures are disabled in firmware, @@ -416,6 +420,7 @@ struct hl_mmu_properties { * from BOOT_DEV_STS0 * @dram_supports_virtual_memory: is there an MMU towards the DRAM * @hard_reset_done_by_fw: true if firmware is handling hard reset flow + * @num_functional_hbms: number of functional HBMs in each DCORE. */ struct asic_fixed_properties { struct hw_queue_properties *hw_queues_props; @@ -468,18 +473,22 @@ struct asic_fixed_properties { u16 sync_stream_first_mon; u16 first_available_user_sob[HL_MAX_DCORES]; u16 first_available_user_mon[HL_MAX_DCORES]; + u16 first_available_user_msix_interrupt; + u16 first_available_cq[HL_MAX_DCORES]; u8 tpc_enabled_mask; u8 completion_queues_count; u8 fw_security_disabled; u8 fw_security_status_valid; u8 dram_supports_virtual_memory; u8 hard_reset_done_by_fw; + u8 num_functional_hbms; }; /** * struct hl_fence - software synchronization primitive * @completion: fence is implemented using completion * @refcount: refcount for this fence + * @cs_sequence: sequence of the corresponding command submission * @error: mark this fence with error * @timestamp: timestamp upon completion * @@ -487,6 +496,7 @@ struct asic_fixed_properties { struct hl_fence { struct completion completion; struct kref refcount; + u64 cs_sequence; int error; ktime_t timestamp; }; @@ -846,6 +856,19 @@ enum div_select_defs { * @collective_wait_init_cs: Generate collective master/slave packets * and place them in the relevant cs jobs * @collective_wait_create_jobs: allocate collective wait cs jobs + * @scramble_addr: Routine to scramble the address prior of mapping it + * in the MMU. + * @descramble_addr: Routine to de-scramble the address prior of + * showing it to users. + * @ack_protection_bits_errors: ack and dump all security violations + * @get_hw_block_id: retrieve a HW block id to be used by the user to mmap it. + * also returns the size of the block if caller supplies + * a valid pointer for it + * @hw_block_mmap: mmap a HW block with a given id. + * @enable_events_from_fw: send interrupt to firmware to notify them the + * driver is ready to receive asynchronous events. This + * function should be called during the first init and + * after every hard-reset of the device */ struct hl_asic_funcs { int (*early_init)(struct hl_device *hdev); @@ -918,8 +941,8 @@ struct hl_asic_funcs { void (*set_clock_gating)(struct hl_device *hdev); void (*disable_clock_gating)(struct hl_device *hdev); int (*debug_coresight)(struct hl_device *hdev, void *data); - bool (*is_device_idle)(struct hl_device *hdev, u64 *mask, - struct seq_file *s); + bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr, + u8 mask_len, struct seq_file *s); int (*soft_reset_late_init)(struct hl_device *hdev); void (*hw_queues_lock)(struct hl_device *hdev); void (*hw_queues_unlock)(struct hl_device *hdev); @@ -955,6 +978,14 @@ struct hl_asic_funcs { int (*collective_wait_create_jobs)(struct hl_device *hdev, struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id, u32 collective_engine_id); + u64 (*scramble_addr)(struct hl_device *hdev, u64 addr); + u64 (*descramble_addr)(struct hl_device *hdev, u64 addr); + void (*ack_protection_bits_errors)(struct hl_device *hdev); + int (*get_hw_block_id)(struct hl_device *hdev, u64 block_addr, + u32 *block_size, u32 *block_id); + int (*hw_block_mmap)(struct hl_device *hdev, struct vm_area_struct *vma, + u32 block_id, u32 block_size); + void (*enable_events_from_fw)(struct hl_device *hdev); }; @@ -1012,6 +1043,20 @@ struct hl_cs_counters_atomic { }; /** + * struct hl_pending_cb - pending command buffer structure + * @cb_node: cb node in pending cb list + * @cb: command buffer to send in next submission + * @cb_size: command buffer size + * @hw_queue_id: destination queue id + */ +struct hl_pending_cb { + struct list_head cb_node; + struct hl_cb *cb; + u32 cb_size; + u32 hw_queue_id; +}; + +/** * struct hl_ctx - user/kernel context. * @mem_hash: holds mapping from virtual address to virtual memory area * descriptor (hl_vm_phys_pg_list or hl_userptr). @@ -1026,6 +1071,8 @@ struct hl_cs_counters_atomic { * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the * MMU hash or walking the PGT requires talking this lock. * @debugfs_list: node in debugfs list of contexts. + * pending_cb_list: list of pending command buffers waiting to be sent upon + * next user command submission context. * @cs_counters: context command submission counters. * @cb_va_pool: device VA pool for command buffers which are mapped to the * device's MMU. @@ -1034,11 +1081,17 @@ struct hl_cs_counters_atomic { * index to cs_pending array. * @dram_default_hops: array that holds all hops addresses needed for default * DRAM mapping. + * @pending_cb_lock: spinlock to protect pending cb list * @cs_lock: spinlock to protect cs_sequence. * @dram_phys_mem: amount of used physical DRAM memory by this context. * @thread_ctx_switch_token: token to prevent multiple threads of the same * context from running the context switch phase. * Only a single thread should run it. + * @thread_pending_cb_token: token to prevent multiple threads from processing + * the pending CB list. Only a single thread should + * process the list since it is protected by a + * spinlock and we don't want to halt the entire + * command submission sequence. * @thread_ctx_switch_wait_token: token to prevent the threads that didn't run * the context switch phase from moving to their * execution phase before the context switch phase @@ -1057,13 +1110,16 @@ struct hl_ctx { struct mutex mem_hash_lock; struct mutex mmu_lock; struct list_head debugfs_list; + struct list_head pending_cb_list; struct hl_cs_counters_atomic cs_counters; struct gen_pool *cb_va_pool; u64 cs_sequence; u64 *dram_default_hops; + spinlock_t pending_cb_lock; spinlock_t cs_lock; atomic64_t dram_phys_mem; atomic_t thread_ctx_switch_token; + atomic_t thread_pending_cb_token; u32 thread_ctx_switch_wait_token; u32 asid; u32 handle; @@ -1124,8 +1180,11 @@ struct hl_userptr { * @finish_work: workqueue object to run when CS is completed by H/W. * @work_tdr: delayed work node for TDR. * @mirror_node : node in device mirror list of command submissions. + * @staged_cs_node: node in the staged cs list. * @debugfs_list: node in debugfs list of command submissions. * @sequence: the sequence number of this CS. + * @staged_sequence: the sequence of the staged submission this CS is part of, + * relevant only if staged_cs is set. * @type: CS_TYPE_*. * @submitted: true if CS was submitted to H/W. * @completed: true if CS was completed by device. @@ -1133,7 +1192,11 @@ struct hl_userptr { * @tdr_active: true if TDR was activated for this CS (to prevent * double TDR activation). * @aborted: true if CS was aborted due to some device error. - * @timestamp: true if a timestmap must be captured upon completion + * @timestamp: true if a timestmap must be captured upon completion. + * @staged_last: true if this is the last staged CS and needs completion. + * @staged_first: true if this is the first staged CS and we need to receive + * timeout for this CS. + * @staged_cs: true if this CS is part of a staged submission. */ struct hl_cs { u16 *jobs_in_queue_cnt; @@ -1146,8 +1209,10 @@ struct hl_cs { struct work_struct finish_work; struct delayed_work work_tdr; struct list_head mirror_node; + struct list_head staged_cs_node; struct list_head debugfs_list; u64 sequence; + u64 staged_sequence; enum hl_cs_type type; u8 submitted; u8 completed; @@ -1155,6 +1220,9 @@ struct hl_cs { u8 tdr_active; u8 aborted; u8 timestamp; + u8 staged_last; + u8 staged_first; + u8 staged_cs; }; /** @@ -1225,6 +1293,7 @@ struct hl_cs_job { * MSG_PROT packets. Relevant only for GAUDI as GOYA doesn't * have streams so the engine can't be busy by another * stream. + * @completion: true if we need completion for this CS. */ struct hl_cs_parser { struct hl_cb *user_cb; @@ -1239,6 +1308,7 @@ struct hl_cs_parser { u8 job_id; u8 is_kernel_allocated_cb; u8 contains_dma_pkt; + u8 completion; }; /* @@ -1688,12 +1758,20 @@ struct hl_mmu_per_hop_info { * struct hl_mmu_hop_info - A structure describing the TLB hops and their * hop-entries that were created in order to translate a virtual address to a * physical one. + * @scrambled_vaddr: The value of the virtual address after scrambling. This + * address replaces the original virtual-address when mapped + * in the MMU tables. + * @unscrambled_paddr: The un-scrambled physical address. * @hop_info: Array holding the per-hop information used for the translation. * @used_hops: The number of hops used for the translation. + * @range_type: virtual address range type. */ struct hl_mmu_hop_info { + u64 scrambled_vaddr; + u64 unscrambled_paddr; struct hl_mmu_per_hop_info hop_info[MMU_ARCH_5_HOPS]; u32 used_hops; + enum hl_va_range_type range_type; }; /** @@ -1766,7 +1844,6 @@ struct hl_mmu_funcs { * @asic_funcs: ASIC specific functions. * @asic_specific: ASIC specific information to use only from ASIC files. * @vm: virtual memory manager for MMU. - * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context. * @hwmon_dev: H/W monitor device. * @pm_mng_profile: current power management profile. * @hl_chip_info: ASIC's sensors information. @@ -1844,6 +1921,7 @@ struct hl_mmu_funcs { * user processes * @device_fini_pending: true if device_fini was called and might be * waiting for the reset thread to finish + * @supports_staged_submission: true if staged submissions are supported */ struct hl_device { struct pci_dev *pdev; @@ -1881,7 +1959,6 @@ struct hl_device { const struct hl_asic_funcs *asic_funcs; void *asic_specific; struct hl_vm vm; - struct mutex mmu_cache_lock; struct device *hwmon_dev; enum hl_pm_mng_profile pm_mng_profile; struct hwmon_chip_info *hl_chip_info; @@ -1950,6 +2027,7 @@ struct hl_device { u8 needs_reset; u8 process_kill_trial_cnt; u8 device_fini_pending; + u8 supports_staged_submission; /* Parameters for bring-up */ u64 nic_ports_mask; @@ -2067,7 +2145,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id, int hl_hw_queue_schedule_cs(struct hl_cs *cs); u32 hl_hw_queue_add_ptr(u32 ptr, u16 val); void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id); -void hl_int_hw_queue_update_ci(struct hl_cs *cs); +void hl_hw_queue_update_ci(struct hl_cs *cs); void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset); #define hl_queue_inc_ptr(p) hl_hw_queue_add_ptr(p, 1) @@ -2123,6 +2201,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, bool map_cb, u64 *handle); int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle); int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma); +int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma); struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 handle); void hl_cb_put(struct hl_cb *cb); @@ -2136,6 +2215,7 @@ int hl_cb_va_pool_init(struct hl_ctx *ctx); void hl_cb_va_pool_fini(struct hl_ctx *ctx); void hl_cs_rollback_all(struct hl_device *hdev); +void hl_pending_cb_list_flush(struct hl_ctx *ctx); struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, enum hl_queue_type queue_type, bool is_kernel_allocated_cb); void hl_sob_reset_error(struct kref *ref); @@ -2143,6 +2223,10 @@ int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask); void hl_fence_put(struct hl_fence *fence); void hl_fence_get(struct hl_fence *fence); void cs_get(struct hl_cs *cs); +bool cs_needs_completion(struct hl_cs *cs); +bool cs_needs_timeout(struct hl_cs *cs); +bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs); +struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq); void goya_set_asic_funcs(struct hl_device *hdev); void gaudi_set_asic_funcs(struct hl_device *hdev); @@ -2184,6 +2268,8 @@ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu); int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr); int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr, struct hl_mmu_hop_info *hops); +u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr); +u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr); bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr); int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name, @@ -2201,7 +2287,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size, void *vaddr); int hl_fw_send_heartbeat(struct hl_device *hdev); int hl_fw_cpucp_info_get(struct hl_device *hdev, - u32 cpu_security_boot_status_reg); + u32 cpu_security_boot_status_reg, + u32 boot_err0_reg); int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size); int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev, struct hl_info_pci_counters *counters); |