Kernels
Commit 645c13c (verified) · committed by danieldk (HF Staff) · 1 parent: 02f588e

Build uploaded using `kernels`.
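
The build can be consumed directly from the Hub. A minimal sketch, assuming the `kernels` Python library and a hypothetical repo id `kernels-community/flash-attn2` (the repo id and the exported function's exact signature are assumptions, not taken from this commit):

import torch
from kernels import get_kernel

# get_kernel resolves the build variant matching the local torch/ABI/backend
# from the repo's build/ directory and loads the shared library.
flash_attn2 = get_kernel("kernels-community/flash-attn2")  # hypothetical repo id

q = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 128, 8, 64, device="cuda", dtype=torch.float16)

# flash_attn_func is one of the wrappers defined in flash_attn_interface.py below.
out = flash_attn2.flash_attn_func(q, k, v, causal=True)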

Files changed (43)
  1. build/torch210-cxx11-cpu-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cpu_9f0ed09.abi3.so} +2 -2
  2. build/torch210-cxx11-cpu-x86_64-linux/_ops.py +3 -3
  3. build/torch210-cxx11-cpu-x86_64-linux/flash_attn_interface.py +6 -8
  4. build/torch210-cxx11-cpu-x86_64-linux/metadata.json +1 -0
  5. build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cuda_9f0ed09.abi3.so} +2 -2
  6. build/torch210-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  7. build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py +6 -8
  8. build/torch210-cxx11-cu126-x86_64-linux/metadata.json +10 -2
  9. build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cuda_9f0ed09.abi3.so} +2 -2
  10. build/torch210-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  11. build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py +6 -8
  12. build/torch210-cxx11-cu128-x86_64-linux/metadata.json +12 -2
  13. build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cuda_9f0ed09.abi3.so} +2 -2
  14. build/torch210-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  15. build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py +6 -8
  16. build/torch210-cxx11-cu130-x86_64-linux/metadata.json +12 -2
  17. build/{torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_660971e.abi3.so → torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so} +2 -2
  18. build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py +3 -3
  19. build/torch210-cxx11-xpu20253-x86_64-linux/flash_attn_interface.py +3 -5
  20. build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json +1 -0
  21. build/torch29-cxx11-cpu-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cpu_9f0ed09.abi3.so} +2 -2
  22. build/torch29-cxx11-cpu-x86_64-linux/_ops.py +3 -3
  23. build/torch29-cxx11-cpu-x86_64-linux/flash_attn_interface.py +6 -8
  24. build/torch29-cxx11-cpu-x86_64-linux/metadata.json +1 -0
  25. build/torch29-cxx11-cu126-x86_64-linux/_flash_attn2_588b404.abi3.so +0 -3
  26. build/torch29-cxx11-cu126-x86_64-linux/_flash_attn2_cuda_9f0ed09.abi3.so +3 -0
  27. build/torch29-cxx11-cu126-x86_64-linux/_ops.py +3 -3
  28. build/torch29-cxx11-cu126-x86_64-linux/flash_attn_interface.py +6 -8
  29. build/torch29-cxx11-cu126-x86_64-linux/metadata.json +10 -2
  30. build/torch29-cxx11-cu128-x86_64-linux/_flash_attn2_588b404.abi3.so +0 -3
  31. build/torch29-cxx11-cu128-x86_64-linux/_flash_attn2_cuda_9f0ed09.abi3.so +3 -0
  32. build/torch29-cxx11-cu128-x86_64-linux/_ops.py +3 -3
  33. build/torch29-cxx11-cu128-x86_64-linux/flash_attn_interface.py +6 -8
  34. build/torch29-cxx11-cu128-x86_64-linux/metadata.json +12 -2
  35. build/torch29-cxx11-cu130-x86_64-linux/_flash_attn2_588b404.abi3.so +0 -3
  36. build/torch29-cxx11-cu130-x86_64-linux/_flash_attn2_cuda_9f0ed09.abi3.so +3 -0
  37. build/torch29-cxx11-cu130-x86_64-linux/_ops.py +3 -3
  38. build/torch29-cxx11-cu130-x86_64-linux/flash_attn_interface.py +6 -8
  39. build/torch29-cxx11-cu130-x86_64-linux/metadata.json +12 -2
  40. build/{torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_660971e.abi3.so → torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so} +2 -2
  41. build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py +3 -3
  42. build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn_interface.py +3 -5
  43. build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json +1 -0
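
Each build directory name encodes the variant it targets: torch version, C++ ABI, compute backend (cpu, cuNNN for CUDA, xpuNNNNN for Intel XPU), CPU architecture, and OS. A sketch of decoding the scheme, inferred purely from the paths above (the pattern is not an official spec of the layout):

import re

# Pattern inferred from the directory names in this commit.
PATTERN = re.compile(
    r"torch(?P<torch>\d+)-(?P<abi>cxx\d+)-(?P<backend>cpu|cu\d+|xpu\d+)"
    r"-(?P<arch>[^-]+)-(?P<os>.+)"
)

m = PATTERN.match("torch210-cxx11-cu126-x86_64-linux")
assert m is not None
print(m.groupdict())
# {'torch': '210', 'abi': 'cxx11', 'backend': 'cu126', 'arch': 'x86_64', 'os': 'linux'}
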
build/torch210-cxx11-cpu-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cpu_9f0ed09.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1d90d30dbcf574c7a50f2c9774884370e71e1e177062c6a233fcc7e1940fffcb
-size 249504
+oid sha256:45757a80c809dcbf1a8c75bde0c42dcba171960901f1c085699036d908a81c20
+size 1942496
build/torch210-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_588b404
-ops = torch.ops._flash_attn2_588b404
+from . import _flash_attn2_cpu_9f0ed09
+ops = torch.ops._flash_attn2_cpu_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_588b404::{op_name}"
+    return f"_flash_attn2_cpu_9f0ed09::{op_name}"
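
This shim exists because each build registers its custom ops under a namespace carrying the backend and build hash (here `_flash_attn2_cpu_9f0ed09`), so different builds can be loaded side by side without op-name collisions; renaming the shared library therefore also means regenerating `_ops.py`. A sketch of how such a shim is typically used, assuming it sits inside the kernel package (`mha_fwd` is a placeholder op name, not necessarily one this kernel exports):

import torch
from ._ops import ops, add_op_namespace_prefix

# Call a kernel op through the per-build namespace.
# out = ops.mha_fwd(q, k, v)  # placeholder op name

# The prefix helper yields the fully qualified "namespace::op" string that
# torch.library APIs expect, e.g. when registering a fake/meta kernel:
@torch.library.register_fake(add_op_namespace_prefix("mha_fwd"))
def _mha_fwd_fake(q, k, v):
    return q.new_empty(q.shape)
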
build/torch210-cxx11-cpu-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1064,7 +1062,7 @@ def flash_attn_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1142,7 +1140,7 @@ def flash_attn_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1219,7 +1217,7 @@ def flash_attn_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
     )
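
This diff repeats across all CPU and CUDA variants: the module-level `_XPU_AVAILABLE` check, which disabled gradients whenever an XPU was merely present, is removed, and the actual `torch.is_grad_enabled()` state is passed through; only the CPU path of `flash_attn_varlen_func` still forces the flag to False. A sketch of what that final `is_grad` argument typically gates inside an autograd wrapper (the class below is illustrative, not the exact code in this file, and uses SDPA as a stand-in for the compiled kernel):

import torch
import torch.nn.functional as F

class _SketchAttnFunc(torch.autograd.Function):
    @staticmethod
    def forward(ctx, q, k, v, is_grad):
        # SDPA stands in for the flash-attn forward kernel.
        out = F.scaled_dot_product_attention(q, k, v)
        ctx.is_grad = is_grad
        if is_grad:
            # Save activations only when a backward pass can happen;
            # with is_grad=False the call stays inference-only and cheaper.
            ctx.save_for_backward(q, k, v, out)
        return out

    @staticmethod
    def backward(ctx, dout):
        q, k, v, out = ctx.saved_tensors
        # The real kernel computes dq, dk, dv here; omitted in this sketch.
        raise NotImplementedError("backward sketch only")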
 
build/torch210-cxx11-cpu-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,5 @@
 {
   "version": 1,
+  "license": "BSD-3-Clause",
   "python-depends": []
 }
build/torch210-cxx11-cu126-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cuda_9f0ed09.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:247ade2063814573447dcb697fd39e738bcf5f0f5d40ac87eaf6cf6dba29298f
-size 448708992
+oid sha256:039e336f68c3efaa02ef0b103a2607b2a544d68a46fa0165e2433464f26223a3
+size 448709016
build/torch210-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_588b404
-ops = torch.ops._flash_attn2_588b404
+from . import _flash_attn2_cuda_9f0ed09
+ops = torch.ops._flash_attn2_cuda_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_588b404::{op_name}"
+    return f"_flash_attn2_cuda_9f0ed09::{op_name}"
build/torch210-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1064,7 +1062,7 @@ def flash_attn_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1142,7 +1140,7 @@ def flash_attn_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1219,7 +1217,7 @@ def flash_attn_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
     )
build/torch210-cxx11-cu126-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,12 @@
 {
   "version": 1,
-  "python-depends": []
-}
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "8.0",
+      "9.0"
+    ]
+  }
+}
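
The CUDA builds now also record which compute capabilities the shared library was compiled for (8.0 covers A100-class GPUs, 9.0 H100-class). A sketch of checking the local GPU against that list (the file path is illustrative, and whether loaders actually consult this field is an assumption):

import json
import torch

with open("build/torch210-cxx11-cu126-x86_64-linux/metadata.json") as f:
    meta = json.load(f)

major, minor = torch.cuda.get_device_capability()
cap = f"{major}.{minor}"  # e.g. "8.0" on A100, "9.0" on H100
print(cap in meta.get("backend", {}).get("archs", []))
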
build/torch210-cxx11-cu128-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cuda_9f0ed09.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:09cfe096dc8f0010e99225d44263e4d9172d4b542d48d656b3b9fd718ca55b7d
-size 1037803376
+oid sha256:ac175faf91cfb9cd9827985bae32380035cb9f880f0bbb702e7f045eee90ae0a
+size 1037803408
build/torch210-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_588b404
-ops = torch.ops._flash_attn2_588b404
+from . import _flash_attn2_cuda_9f0ed09
+ops = torch.ops._flash_attn2_cuda_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_588b404::{op_name}"
+    return f"_flash_attn2_cuda_9f0ed09::{op_name}"
build/torch210-cxx11-cu128-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1064,7 +1062,7 @@ def flash_attn_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1142,7 +1140,7 @@ def flash_attn_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1219,7 +1217,7 @@ def flash_attn_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
     )
build/torch210-cxx11-cu128-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,14 @@
 {
   "version": 1,
-  "python-depends": []
-}
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "12.0",
+      "8.0",
+      "9.0"
+    ]
+  }
+}
build/torch210-cxx11-cu130-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cuda_9f0ed09.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:196d3756a7d099f5e23ddd53ebc47aadf558a96e1d7873f5a14faec09bb7b707
-size 1009055064
+oid sha256:e67e00b79ea3625b0ec32a083544d1808cce682dfe593a7212c525a292d1764f
+size 1009055088
build/torch210-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_588b404
-ops = torch.ops._flash_attn2_588b404
+from . import _flash_attn2_cuda_9f0ed09
+ops = torch.ops._flash_attn2_cuda_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_588b404::{op_name}"
+    return f"_flash_attn2_cuda_9f0ed09::{op_name}"
build/torch210-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1064,7 +1062,7 @@ def flash_attn_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1142,7 +1140,7 @@ def flash_attn_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1219,7 +1217,7 @@ def flash_attn_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
     )
build/torch210-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,14 @@
 {
   "version": 1,
-  "python-depends": []
-}
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "12.0",
+      "8.0",
+      "9.0"
+    ]
+  }
+}
build/{torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_660971e.abi3.so → torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7d2c3b732c7af702975d491dd792485a9dc9a68f41d8f6cdd9fca64045dd27b6
-size 13907792
+oid sha256:192cfb14df060c1fd913c7a6e5a588da44dec07d0a9adf156ed377d860c8d3c2
+size 15436096
build/torch210-cxx11-xpu20253-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_660971e
-ops = torch.ops._flash_attn2_660971e
+from . import _flash_attn2_xpu_9f0ed09
+ops = torch.ops._flash_attn2_xpu_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_660971e::{op_name}"
+    return f"_flash_attn2_xpu_9f0ed09::{op_name}"
build/torch210-cxx11-xpu20253-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
     )
build/torch210-cxx11-xpu20253-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,5 @@
 {
   "version": 1,
+  "license": "BSD-3-Clause",
   "python-depends": []
 }
build/torch29-cxx11-cpu-x86_64-linux/{_flash_attn2_588b404.abi3.so → _flash_attn2_cpu_9f0ed09.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dfab2b6517d74cf640d3da51f60a787dc42fa98774e10bf52d0a265cc5423f53
-size 239416
+oid sha256:7313920d802d0477ca3d4144bc3b11e3c4761ea0e42b55a9b1b0b05567d23f71
+size 1932200
build/torch29-cxx11-cpu-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_588b404
-ops = torch.ops._flash_attn2_588b404
+from . import _flash_attn2_cpu_9f0ed09
+ops = torch.ops._flash_attn2_cpu_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_588b404::{op_name}"
+    return f"_flash_attn2_cpu_9f0ed09::{op_name}"
build/torch29-cxx11-cpu-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1064,7 +1062,7 @@ def flash_attn_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1142,7 +1140,7 @@ def flash_attn_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1219,7 +1217,7 @@ def flash_attn_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
     )
build/torch29-cxx11-cpu-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,5 @@
 {
   "version": 1,
+  "license": "BSD-3-Clause",
   "python-depends": []
 }
build/torch29-cxx11-cu126-x86_64-linux/_flash_attn2_588b404.abi3.so DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:82a33a975de0a2c8e2440d596ecde21e4f3e1e8dcc9df42843e2045edb1e6d47
-size 448648728
build/torch29-cxx11-cu126-x86_64-linux/_flash_attn2_cuda_9f0ed09.abi3.so ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d422421456bf5ac34486ee898a7a6aaea7fff2edda3bce062d0283f69806275
+size 448648752
build/torch29-cxx11-cu126-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_588b404
-ops = torch.ops._flash_attn2_588b404
+from . import _flash_attn2_cuda_9f0ed09
+ops = torch.ops._flash_attn2_cuda_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_588b404::{op_name}"
+    return f"_flash_attn2_cuda_9f0ed09::{op_name}"
build/torch29-cxx11-cu126-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1064,7 +1062,7 @@ def flash_attn_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1142,7 +1140,7 @@ def flash_attn_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1219,7 +1217,7 @@ def flash_attn_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
     )
build/torch29-cxx11-cu126-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,12 @@
 {
   "version": 1,
-  "python-depends": []
-}
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "8.0",
+      "9.0"
+    ]
+  }
+}
build/torch29-cxx11-cu128-x86_64-linux/_flash_attn2_588b404.abi3.so DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7b3afc2eda58b3649ac67513c775ce1cb124e5498f8dbbbe4ef07db6857d56d3
-size 1037644608
build/torch29-cxx11-cu128-x86_64-linux/_flash_attn2_cuda_9f0ed09.abi3.so ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:29b22b8c1bbd77125b6c82aca5fecfe0416d2f116be7b1e1a4638f76fe542a2e
+size 1037644632
build/torch29-cxx11-cu128-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_588b404
-ops = torch.ops._flash_attn2_588b404
+from . import _flash_attn2_cuda_9f0ed09
+ops = torch.ops._flash_attn2_cuda_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_588b404::{op_name}"
+    return f"_flash_attn2_cuda_9f0ed09::{op_name}"
build/torch29-cxx11-cu128-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1064,7 +1062,7 @@ def flash_attn_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1142,7 +1140,7 @@ def flash_attn_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1219,7 +1217,7 @@ def flash_attn_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
     )
build/torch29-cxx11-cu128-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,14 @@
 {
   "version": 1,
-  "python-depends": []
-}
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "12.0",
+      "8.0",
+      "9.0"
+    ]
+  }
+}
build/torch29-cxx11-cu130-x86_64-linux/_flash_attn2_588b404.abi3.so DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:568eb670747b578b865649894b7674d9053a2ba660ba2e491030c788e3d5936a
-size 1009019168
build/torch29-cxx11-cu130-x86_64-linux/_flash_attn2_cuda_9f0ed09.abi3.so ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9e000e77b2d5f5b8554c1ba0e1edfc173bd19d904b95eede3f9cc7ecefbcf89
+size 1009019192
build/torch29-cxx11-cu130-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_588b404
-ops = torch.ops._flash_attn2_588b404
+from . import _flash_attn2_cuda_9f0ed09
+ops = torch.ops._flash_attn2_cuda_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_588b404::{op_name}"
+    return f"_flash_attn2_cuda_9f0ed09::{op_name}"
build/torch29-cxx11-cu130-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1064,7 +1062,7 @@ def flash_attn_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1142,7 +1140,7 @@ def flash_attn_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1219,7 +1217,7 @@ def flash_attn_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
     )
build/torch29-cxx11-cu130-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,14 @@
 {
   "version": 1,
-  "python-depends": []
-}
+  "license": "BSD-3-Clause",
+  "python-depends": [],
+  "backend": {
+    "type": "cuda",
+    "archs": [
+      "10.0",
+      "12.0",
+      "8.0",
+      "9.0"
+    ]
+  }
+}
build/{torch210-cxx11-xpu20253-x86_64-linux/_flash_attn2_660971e.abi3.so → torch29-cxx11-xpu20252-x86_64-linux/_flash_attn2_xpu_9f0ed09.abi3.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:29c0a6792979ae3880bd1cf1077d710b14fbd96b244665c75e4e7aaf65449a49
-size 15424416
+oid sha256:262841040bd11d2ea11f317107fdc9484d864db0377b423eb9007f4c8a7eb74f
+size 13923672
build/torch29-cxx11-xpu20252-x86_64-linux/_ops.py CHANGED
@@ -1,9 +1,9 @@
 import torch
-from . import _flash_attn2_660971e
-ops = torch.ops._flash_attn2_660971e
+from . import _flash_attn2_xpu_9f0ed09
+ops = torch.ops._flash_attn2_xpu_9f0ed09
 
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_flash_attn2_660971e::{op_name}"
+    return f"_flash_attn2_xpu_9f0ed09::{op_name}"
build/torch29-cxx11-xpu20252-x86_64-linux/flash_attn_interface.py CHANGED
@@ -31,8 +31,6 @@ def _get_device():
     else:
         return "cpu"
 
-_XPU_AVAILABLE = torch.xpu.is_available() if hasattr(torch, "xpu") else False  # TODO remove hasattr check when bwd is supported on XPU
-
 
 def _get_block_size_n(device, head_dim, is_dropout, is_causal):
     # This should match the block sizes in the CUDA kernel
@@ -1285,7 +1283,7 @@ def flash_attn_varlen_qkvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1377,7 +1375,7 @@ def flash_attn_varlen_kvpacked_func(
         alibi_slopes,
         deterministic,
         return_attn_probs,
-        False if _XPU_AVAILABLE else torch.is_grad_enabled(),
+        torch.is_grad_enabled(),
     )
@@ -1471,7 +1469,7 @@ def flash_attn_varlen_func(
         deterministic,
         return_attn_probs,
         block_table,
-        False if _XPU_AVAILABLE or q.device.type == "cpu" else torch.is_grad_enabled(),
+        False if q.device.type == "cpu" else torch.is_grad_enabled(),
    )
build/torch29-cxx11-xpu20252-x86_64-linux/metadata.json CHANGED
@@ -1,4 +1,5 @@
 {
   "version": 1,
+  "license": "BSD-3-Clause",
   "python-depends": []
 }