summaryrefslogtreecommitdiffstats
path: root/drivers/misc/habanalabs/common/habanalabs.h
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs/common/habanalabs.h')
-rw-r--r--drivers/misc/habanalabs/common/habanalabs.h109
1 files changed, 98 insertions, 11 deletions
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 41af347157e0..d933878b24d1 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -28,17 +28,18 @@
#define HL_NAME "habanalabs"
/* Use upper bits of mmap offset to store habana driver specific information.
- * bits[63:62] - Encode mmap type
+ * bits[63:61] - Encode mmap type
* bits[45:0] - mmap offset value
*
* NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
* defines are w.r.t to PAGE_SIZE
*/
-#define HL_MMAP_TYPE_SHIFT (62 - PAGE_SHIFT)
-#define HL_MMAP_TYPE_MASK (0x3ull << HL_MMAP_TYPE_SHIFT)
+#define HL_MMAP_TYPE_SHIFT (61 - PAGE_SHIFT)
+#define HL_MMAP_TYPE_MASK (0x7ull << HL_MMAP_TYPE_SHIFT)
+#define HL_MMAP_TYPE_BLOCK (0x4ull << HL_MMAP_TYPE_SHIFT)
#define HL_MMAP_TYPE_CB (0x2ull << HL_MMAP_TYPE_SHIFT)
-#define HL_MMAP_OFFSET_VALUE_MASK (0x3FFFFFFFFFFFull >> PAGE_SHIFT)
+#define HL_MMAP_OFFSET_VALUE_MASK (0x1FFFFFFFFFFFull >> PAGE_SHIFT)
#define HL_MMAP_OFFSET_VALUE_GET(off) (off & HL_MMAP_OFFSET_VALUE_MASK)
#define HL_PENDING_RESET_PER_SEC 10
@@ -408,6 +409,9 @@ struct hl_mmu_properties {
* @sync_stream_first_mon: first monitor available for sync stream use
* @first_available_user_sob: first sob available for the user
* @first_available_user_mon: first monitor available for the user
+ * @first_available_user_msix_interrupt: first available msix interrupt
+ * reserved for the user
+ * @first_available_cq: first available CQ for the user.
* @tpc_enabled_mask: which TPCs are enabled.
* @completion_queues_count: number of completion queues.
* @fw_security_disabled: true if security measures are disabled in firmware,
@@ -416,6 +420,7 @@ struct hl_mmu_properties {
* from BOOT_DEV_STS0
* @dram_supports_virtual_memory: is there an MMU towards the DRAM
* @hard_reset_done_by_fw: true if firmware is handling hard reset flow
+ * @num_functional_hbms: number of functional HBMs in each DCORE.
*/
struct asic_fixed_properties {
struct hw_queue_properties *hw_queues_props;
@@ -468,18 +473,22 @@ struct asic_fixed_properties {
u16 sync_stream_first_mon;
u16 first_available_user_sob[HL_MAX_DCORES];
u16 first_available_user_mon[HL_MAX_DCORES];
+ u16 first_available_user_msix_interrupt;
+ u16 first_available_cq[HL_MAX_DCORES];
u8 tpc_enabled_mask;
u8 completion_queues_count;
u8 fw_security_disabled;
u8 fw_security_status_valid;
u8 dram_supports_virtual_memory;
u8 hard_reset_done_by_fw;
+ u8 num_functional_hbms;
};
/**
* struct hl_fence - software synchronization primitive
* @completion: fence is implemented using completion
* @refcount: refcount for this fence
+ * @cs_sequence: sequence of the corresponding command submission
* @error: mark this fence with error
* @timestamp: timestamp upon completion
*
@@ -487,6 +496,7 @@ struct asic_fixed_properties {
struct hl_fence {
struct completion completion;
struct kref refcount;
+ u64 cs_sequence;
int error;
ktime_t timestamp;
};
@@ -846,6 +856,19 @@ enum div_select_defs {
* @collective_wait_init_cs: Generate collective master/slave packets
* and place them in the relevant cs jobs
* @collective_wait_create_jobs: allocate collective wait cs jobs
+ * @scramble_addr: Routine to scramble the address prior of mapping it
+ * in the MMU.
+ * @descramble_addr: Routine to de-scramble the address prior of
+ * showing it to users.
+ * @ack_protection_bits_errors: ack and dump all security violations
+ * @get_hw_block_id: retrieve a HW block id to be used by the user to mmap it.
+ * also returns the size of the block if caller supplies
+ * a valid pointer for it
+ * @hw_block_mmap: mmap a HW block with a given id.
+ * @enable_events_from_fw: send interrupt to firmware to notify them the
+ * driver is ready to receive asynchronous events. This
+ * function should be called during the first init and
+ * after every hard-reset of the device
*/
struct hl_asic_funcs {
int (*early_init)(struct hl_device *hdev);
@@ -918,8 +941,8 @@ struct hl_asic_funcs {
void (*set_clock_gating)(struct hl_device *hdev);
void (*disable_clock_gating)(struct hl_device *hdev);
int (*debug_coresight)(struct hl_device *hdev, void *data);
- bool (*is_device_idle)(struct hl_device *hdev, u64 *mask,
- struct seq_file *s);
+ bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr,
+ u8 mask_len, struct seq_file *s);
int (*soft_reset_late_init)(struct hl_device *hdev);
void (*hw_queues_lock)(struct hl_device *hdev);
void (*hw_queues_unlock)(struct hl_device *hdev);
@@ -955,6 +978,14 @@ struct hl_asic_funcs {
int (*collective_wait_create_jobs)(struct hl_device *hdev,
struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
u32 collective_engine_id);
+ u64 (*scramble_addr)(struct hl_device *hdev, u64 addr);
+ u64 (*descramble_addr)(struct hl_device *hdev, u64 addr);
+ void (*ack_protection_bits_errors)(struct hl_device *hdev);
+ int (*get_hw_block_id)(struct hl_device *hdev, u64 block_addr,
+ u32 *block_size, u32 *block_id);
+ int (*hw_block_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
+ u32 block_id, u32 block_size);
+ void (*enable_events_from_fw)(struct hl_device *hdev);
};
@@ -1012,6 +1043,20 @@ struct hl_cs_counters_atomic {
};
/**
+ * struct hl_pending_cb - pending command buffer structure
+ * @cb_node: cb node in pending cb list
+ * @cb: command buffer to send in next submission
+ * @cb_size: command buffer size
+ * @hw_queue_id: destination queue id
+ */
+struct hl_pending_cb {
+ struct list_head cb_node;
+ struct hl_cb *cb;
+ u32 cb_size;
+ u32 hw_queue_id;
+};
+
+/**
* struct hl_ctx - user/kernel context.
* @mem_hash: holds mapping from virtual address to virtual memory area
* descriptor (hl_vm_phys_pg_list or hl_userptr).
@@ -1026,6 +1071,8 @@ struct hl_cs_counters_atomic {
* @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
* MMU hash or walking the PGT requires talking this lock.
* @debugfs_list: node in debugfs list of contexts.
+ * pending_cb_list: list of pending command buffers waiting to be sent upon
+ * next user command submission context.
* @cs_counters: context command submission counters.
* @cb_va_pool: device VA pool for command buffers which are mapped to the
* device's MMU.
@@ -1034,11 +1081,17 @@ struct hl_cs_counters_atomic {
* index to cs_pending array.
* @dram_default_hops: array that holds all hops addresses needed for default
* DRAM mapping.
+ * @pending_cb_lock: spinlock to protect pending cb list
* @cs_lock: spinlock to protect cs_sequence.
* @dram_phys_mem: amount of used physical DRAM memory by this context.
* @thread_ctx_switch_token: token to prevent multiple threads of the same
* context from running the context switch phase.
* Only a single thread should run it.
+ * @thread_pending_cb_token: token to prevent multiple threads from processing
+ * the pending CB list. Only a single thread should
+ * process the list since it is protected by a
+ * spinlock and we don't want to halt the entire
+ * command submission sequence.
* @thread_ctx_switch_wait_token: token to prevent the threads that didn't run
* the context switch phase from moving to their
* execution phase before the context switch phase
@@ -1057,13 +1110,16 @@ struct hl_ctx {
struct mutex mem_hash_lock;
struct mutex mmu_lock;
struct list_head debugfs_list;
+ struct list_head pending_cb_list;
struct hl_cs_counters_atomic cs_counters;
struct gen_pool *cb_va_pool;
u64 cs_sequence;
u64 *dram_default_hops;
+ spinlock_t pending_cb_lock;
spinlock_t cs_lock;
atomic64_t dram_phys_mem;
atomic_t thread_ctx_switch_token;
+ atomic_t thread_pending_cb_token;
u32 thread_ctx_switch_wait_token;
u32 asid;
u32 handle;
@@ -1124,8 +1180,11 @@ struct hl_userptr {
* @finish_work: workqueue object to run when CS is completed by H/W.
* @work_tdr: delayed work node for TDR.
* @mirror_node : node in device mirror list of command submissions.
+ * @staged_cs_node: node in the staged cs list.
* @debugfs_list: node in debugfs list of command submissions.
* @sequence: the sequence number of this CS.
+ * @staged_sequence: the sequence of the staged submission this CS is part of,
+ * relevant only if staged_cs is set.
* @type: CS_TYPE_*.
* @submitted: true if CS was submitted to H/W.
* @completed: true if CS was completed by device.
@@ -1133,7 +1192,11 @@ struct hl_userptr {
* @tdr_active: true if TDR was activated for this CS (to prevent
* double TDR activation).
* @aborted: true if CS was aborted due to some device error.
- * @timestamp: true if a timestmap must be captured upon completion
+ * @timestamp: true if a timestmap must be captured upon completion.
+ * @staged_last: true if this is the last staged CS and needs completion.
+ * @staged_first: true if this is the first staged CS and we need to receive
+ * timeout for this CS.
+ * @staged_cs: true if this CS is part of a staged submission.
*/
struct hl_cs {
u16 *jobs_in_queue_cnt;
@@ -1146,8 +1209,10 @@ struct hl_cs {
struct work_struct finish_work;
struct delayed_work work_tdr;
struct list_head mirror_node;
+ struct list_head staged_cs_node;
struct list_head debugfs_list;
u64 sequence;
+ u64 staged_sequence;
enum hl_cs_type type;
u8 submitted;
u8 completed;
@@ -1155,6 +1220,9 @@ struct hl_cs {
u8 tdr_active;
u8 aborted;
u8 timestamp;
+ u8 staged_last;
+ u8 staged_first;
+ u8 staged_cs;
};
/**
@@ -1225,6 +1293,7 @@ struct hl_cs_job {
* MSG_PROT packets. Relevant only for GAUDI as GOYA doesn't
* have streams so the engine can't be busy by another
* stream.
+ * @completion: true if we need completion for this CS.
*/
struct hl_cs_parser {
struct hl_cb *user_cb;
@@ -1239,6 +1308,7 @@ struct hl_cs_parser {
u8 job_id;
u8 is_kernel_allocated_cb;
u8 contains_dma_pkt;
+ u8 completion;
};
/*
@@ -1688,12 +1758,20 @@ struct hl_mmu_per_hop_info {
* struct hl_mmu_hop_info - A structure describing the TLB hops and their
* hop-entries that were created in order to translate a virtual address to a
* physical one.
+ * @scrambled_vaddr: The value of the virtual address after scrambling. This
+ * address replaces the original virtual-address when mapped
+ * in the MMU tables.
+ * @unscrambled_paddr: The un-scrambled physical address.
* @hop_info: Array holding the per-hop information used for the translation.
* @used_hops: The number of hops used for the translation.
+ * @range_type: virtual address range type.
*/
struct hl_mmu_hop_info {
+ u64 scrambled_vaddr;
+ u64 unscrambled_paddr;
struct hl_mmu_per_hop_info hop_info[MMU_ARCH_5_HOPS];
u32 used_hops;
+ enum hl_va_range_type range_type;
};
/**
@@ -1766,7 +1844,6 @@ struct hl_mmu_funcs {
* @asic_funcs: ASIC specific functions.
* @asic_specific: ASIC specific information to use only from ASIC files.
* @vm: virtual memory manager for MMU.
- * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context.
* @hwmon_dev: H/W monitor device.
* @pm_mng_profile: current power management profile.
* @hl_chip_info: ASIC's sensors information.
@@ -1844,6 +1921,7 @@ struct hl_mmu_funcs {
* user processes
* @device_fini_pending: true if device_fini was called and might be
* waiting for the reset thread to finish
+ * @supports_staged_submission: true if staged submissions are supported
*/
struct hl_device {
struct pci_dev *pdev;
@@ -1881,7 +1959,6 @@ struct hl_device {
const struct hl_asic_funcs *asic_funcs;
void *asic_specific;
struct hl_vm vm;
- struct mutex mmu_cache_lock;
struct device *hwmon_dev;
enum hl_pm_mng_profile pm_mng_profile;
struct hwmon_chip_info *hl_chip_info;
@@ -1950,6 +2027,7 @@ struct hl_device {
u8 needs_reset;
u8 process_kill_trial_cnt;
u8 device_fini_pending;
+ u8 supports_staged_submission;
/* Parameters for bring-up */
u64 nic_ports_mask;
@@ -2067,7 +2145,7 @@ int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
int hl_hw_queue_schedule_cs(struct hl_cs *cs);
u32 hl_hw_queue_add_ptr(u32 ptr, u16 val);
void hl_hw_queue_inc_ci_kernel(struct hl_device *hdev, u32 hw_queue_id);
-void hl_int_hw_queue_update_ci(struct hl_cs *cs);
+void hl_hw_queue_update_ci(struct hl_cs *cs);
void hl_hw_queue_reset(struct hl_device *hdev, bool hard_reset);
#define hl_queue_inc_ptr(p) hl_hw_queue_add_ptr(p, 1)
@@ -2123,6 +2201,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
bool map_cb, u64 *handle);
int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
+int hl_hw_block_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
struct hl_cb *hl_cb_get(struct hl_device *hdev, struct hl_cb_mgr *mgr,
u32 handle);
void hl_cb_put(struct hl_cb *cb);
@@ -2136,6 +2215,7 @@ int hl_cb_va_pool_init(struct hl_ctx *ctx);
void hl_cb_va_pool_fini(struct hl_ctx *ctx);
void hl_cs_rollback_all(struct hl_device *hdev);
+void hl_pending_cb_list_flush(struct hl_ctx *ctx);
struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
void hl_sob_reset_error(struct kref *ref);
@@ -2143,6 +2223,10 @@ int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask);
void hl_fence_put(struct hl_fence *fence);
void hl_fence_get(struct hl_fence *fence);
void cs_get(struct hl_cs *cs);
+bool cs_needs_completion(struct hl_cs *cs);
+bool cs_needs_timeout(struct hl_cs *cs);
+bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs);
+struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq);
void goya_set_asic_funcs(struct hl_device *hdev);
void gaudi_set_asic_funcs(struct hl_device *hdev);
@@ -2184,6 +2268,8 @@ void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr);
int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
struct hl_mmu_hop_info *hops);
+u64 hl_mmu_scramble_addr(struct hl_device *hdev, u64 addr);
+u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr);
bool hl_is_dram_va(struct hl_device *hdev, u64 virt_addr);
int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
@@ -2201,7 +2287,8 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
void *vaddr);
int hl_fw_send_heartbeat(struct hl_device *hdev);
int hl_fw_cpucp_info_get(struct hl_device *hdev,
- u32 cpu_security_boot_status_reg);
+ u32 cpu_security_boot_status_reg,
+ u32 boot_err0_reg);
int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
struct hl_info_pci_counters *counters);