admin管理员组

文章数量:1345017

I noticed a strange phenomenon while benchmarking the performance of my conv2d functions that were created with NumPy. As the input size increases, there appears to be an input-size threshold that causes a step slowdown in the performance of the conv2d function. See this diagram, where 1000 reruns per input length were done. Other than the first two methods, the latter five methods all showed a significant slowdown after exceeding an input length of ~230 pixels.

After some investigation, I discovered that this phenomenon was unique to my conv2d functions that used the numpy.dot() method: the former 2 conv2d functions used the np.multiply() and np.sum() methods while the latter 5 conv2d functions used the numpy.dot() method.

Questions:

  1. Can you explain the reason for this slow down? Is this due to hardware limitation or some NumPy settings?
  2. Is there any way to circumvent this slowdown?

Below is a script to demonstrate the issue:

import numpy as np
from timeit import timeit
import matplotlib.pyplot as plt


def conv2d_np_as_strided_2d(inp: np.ndarray, ker: np.ndarray, pad: int, stride: int) -> np.ndarray:
    """2D cross-correlation ("conv2d") using an as_strided patch view and a BLAS matvec.

    Args:
        inp: 2D input array of shape (hi, wi).
        ker: 2D kernel array of shape (hk, wk).
        pad: amount of zero padding applied symmetrically to both spatial dims.
        stride: step between consecutive patches.

    Returns:
        2D output array of shape (ho, wo).
    """
    hi, wi = inp.shape
    hk, wk = ker.shape
    ho = (hi + 2 * pad - hk) // stride + 1
    wo = (wi + 2 * pad - wk) // stride + 1

    if pad > 0:
        inp = np.pad(inp, ((pad, pad), (pad, pad),), mode="constant", constant_values=0.0,)

    # Zero-copy sliding-window view: an (ho, wo) grid of (hk, wk) patches.
    patches = np.lib.stride_tricks.as_strided(
        inp, shape=(ho, wo, hk, wk),
        strides=(inp.strides[0] * stride, inp.strides[1] * stride, inp.strides[0], inp.strides[1],),
        writeable=False,
    )

    # ker.ravel() avoids the copy that flatten() always makes; the original
    # trailing .T was a no-op on a 1-D array.  reshape() materializes the
    # patch matrix (the strided view cannot be reshaped in place), and
    # np.dot dispatches the (ho*wo, ker.size) @ (ker.size,) matvec to BLAS.
    return np.dot(patches.reshape(ho * wo, ker.size), ker.ravel()).reshape(ho, wo)


def get_func_average_runtime(rng, func, input_sizes, ksize, pad, stride, num):
    """Time ``func`` once per input size and return its per-call averages.

    For each size in ``input_sizes`` a fresh random float32 square image and
    ``ksize`` x ``ksize`` kernel are drawn from ``rng``, and ``func`` is run
    ``num`` times under ``timeit``.

    Returns:
        Tuple of (``func.__name__``, float32 array of average seconds per call).
    """
    samples = []
    for size in input_sizes:
        image = rng.random((size, size)).astype(np.float32)
        kernel = rng.random((ksize, ksize)).astype(np.float32)
        samples.append(timeit(lambda: func(image, kernel, pad, stride), number=num))

    return func.__name__, np.asarray(samples, dtype=np.float32) / num


def benchmark_conv2d():
    """Benchmark the as_strided conv2d over a sweep of input sizes and plot the result."""
    repeats = 30
    sizes = tuple(range(10, 302, 2))
    rng = np.random.default_rng()
    name, averages = get_func_average_runtime(
            rng, conv2d_np_as_strided_2d, sizes, 3, 1, 1, repeats,
        )
    plt.plot(sizes, averages, label=name)
    plt.xlabel("Input Size")
    plt.ylabel("Average Runtime (seconds)")
    plt.title("Average Runtime vs Array Size")
    plt.legend()
    plt.grid(True)
    plt.show()


# Guard the entry point so importing this module does not launch the benchmark.
if __name__ == "__main__":
    benchmark_conv2d()

$ uname -srv
Linux 6.11.0-21-generic #21~24.04.1-Ubuntu SMP PREEMPT_DYNAMIC Mon Feb 24 16:52:15 UTC 2

$ uv run python --version
Python 3.12.3

$ uv tree
Resolved 18 packages in 1ms
test v0.1.0
├── matplotlib v3.10.1
│   ├── contourpy v1.3.1
│   │   └── numpy v2.2.4
│   ├── cycler v0.12.1
│   ├── fonttools v4.56.0
│   ├── kiwisolver v1.4.8
│   ├── numpy v2.2.4
│   ├── packaging v24.2
│   ├── pillow v11.2.0
│   ├── pyparsing v3.2.3
│   └── python-dateutil v2.9.0.post0
│       └── six v1.17.0
├── numpy v2.2.4
└── scikit-image v0.25.2
    ├── imageio v2.37.0
    │   ├── numpy v2.2.4
    │   └── pillow v11.2.0
    ├── lazy-loader v0.4
    │   └── packaging v24.2
    ├── networkx v3.4.2
    ├── numpy v2.2.4
    ├── packaging v24.2
    ├── pillow v11.2.0
    ├── scipy v1.15.2
    │   └── numpy v2.2.4
    └── tifffile v2025.3.30
        └── numpy v2.2.4

$ lscpu | grep name
Model name:                           Intel(R) Core(TM) i9-7960X CPU @ 2.80GHz

Output from np.show_config():

{
  "Compilers": {
    "c": {
      "name": "gcc",
      "linker": "ld.bfd",
      "version": "10.2.1",
      "commands": "cc"
    },
    "cython": {
      "name": "cython",
      "linker": "cython",
      "version": "3.0.12",
      "commands": "cython"
    },
    "c++": {
      "name": "gcc",
      "linker": "ld.bfd",
      "version": "10.2.1",
      "commands": "c++"
    }
  },
  "Machine Information": {
    "host": {
      "cpu": "x86_64",
      "family": "x86_64",
      "endian": "little",
      "system": "linux"
    },
    "build": {
      "cpu": "x86_64",
      "family": "x86_64",
      "endian": "little",
      "system": "linux"
    }
  },
  "Build Dependencies": {
    "blas": {
      "name": "scipy-openblas",
      "found": true,
      "version": "0.3.28",
      "detection method": "pkgconfig",
      "include directory": "/opt/_internal/cpython-3.12.7/lib/python3.12/site-packages/scipy_openblas64/include",
      "lib directory": "/opt/_internal/cpython-3.12.7/lib/python3.12/site-packages/scipy_openblas64/lib",
      "openblas configuration": "OpenBLAS 0.3.28  USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64",
      "pc file directory": "/project/.openblas"
    },
    "lapack": {
      "name": "scipy-openblas",
      "found": true,
      "version": "0.3.28",
      "detection method": "pkgconfig",
      "include directory": "/opt/_internal/cpython-3.12.7/lib/python3.12/site-packages/scipy_openblas64/include",
      "lib directory": "/opt/_internal/cpython-3.12.7/lib/python3.12/site-packages/scipy_openblas64/lib",
      "openblas configuration": "OpenBLAS 0.3.28  USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64",
      "pc file directory": "/project/.openblas"
    }
  },
  "Python Information": {
    "path": "/tmp/build-env-p680qjv9/bin/python",
    "version": "3.12"
  },
  "SIMD Extensions": {
    "baseline": [
      "SSE",
      "SSE2",
      "SSE3"
    ],
    "found": [
      "SSSE3",
      "SSE41",
      "POPCNT",
      "SSE42",
      "AVX",
      "F16C",
      "FMA3",
      "AVX2",
      "AVX512F",
      "AVX512CD",
      "AVX512_SKX"
    ],
    "not found": [
      "AVX512_KNL",
      "AVX512_KNM",
      "AVX512_CLX",
      "AVX512_CNL",
      "AVX512_ICL"
    ]
  }
}

本文标签: pythonHow to overcome slow down seen in numpydot of larger input sizes during convolutionStack Overflow