{"id":2259,"date":"2024-06-20T10:43:34","date_gmt":"2024-06-20T02:43:34","guid":{"rendered":"https:\/\/sanlangcode.com\/?p=2259"},"modified":"2024-06-20T10:43:34","modified_gmt":"2024-06-20T02:43:34","slug":"cuda%e7%bc%96%e7%a8%8b%e5%ad%a6%e4%b9%a0%e5%85%a5%e9%97%a8%e3%80%90%e8%bd%ac%e3%80%91","status":"publish","type":"post","link":"https:\/\/sanlangcode.com\/index.php\/2024\/06\/20\/cuda%e7%bc%96%e7%a8%8b%e5%ad%a6%e4%b9%a0%e5%85%a5%e9%97%a8%e3%80%90%e8%bd%ac%e3%80%91\/","title":{"rendered":"CUDA\u7f16\u7a0b\u5b66\u4e60\u5165\u95e8\u3010\u8f6c\u3011"},"content":{"rendered":"\n<p>\u82f1\u4f1f\u8fbe\u663e\u5361\u7b49\u7ea7\u662f\u6839\u636eCUDA\u8fdb\u884c\u5212\u5206\uff0c\u6bd4\u59824090\u5c31\u62e5\u670916384 \u4e2a CUDA\u6838\u5fc3\u3002CUDA\uff08Compute Unified Device Architecture\uff09\u7684\u4e2d\u6587\u5168\u79f0\u4e3a\u8ba1\u7b97\u7edf\u4e00\u8bbe\u5907\u67b6\u6784\u3002\u505a\u56fe\u50cf\u89c6\u89c9\u9886\u57df\u7684\u540c\u5b66\u591a\u591a\u5c11\u5c11\u90fd\u4f1a\u63a5\u89e6\u5230CUDA\uff0c\u6bd5\u7adf\u8981\u505a\u6027\u80fd\u901f\u5ea6\u4f18\u5316\uff0cCUDA\u662f\u4e2a\u5f88\u91cd\u8981\u7684\u5de5\u5177\uff0cCUDA\u662f\u505a\u89c6\u89c9\u7684\u540c\u5b66\u96be\u4ee5\u7ed5\u8fc7\u7684\u4e00\u4e2a\u5751\uff0c\u5fc5\u987b\u8e29\u4e00\u8e29\u624d\u8e0f\u5b9e\u3002CUDA\u7f16\u7a0b\u771f\u7684\u662f\u5165\u95e8\u5bb9\u6613\u7cbe\u901a\u96be\uff0c\u5177\u6709\u8ba1\u7b97\u673a\u4f53\u7cfb\u7ed3\u6784\u548cC\u8bed\u8a00\u7f16\u7a0b\u77e5\u8bc6\u50a8\u5907\u7684\u540c\u5b66\u4e0a\u624bCUDA\u7f16\u7a0b\u5e94\u8be5\u96be\u5ea6\u4e0d\u4f1a\u5f88\u5927\u3002\u672c\u6587\u7ae0\u5c06\u901a\u8fc7\u4ee5\u4e0b\u4e94\u4e2a\u65b9\u9762\u5e2e\u52a9\u5927\u5bb6\u6bd4\u8f83\u5168\u9762\u5730\u4e86\u89e3CUDA\u7f16\u7a0b\u6700\u91cd\u8981\u7684\u77e5\u8bc6\u70b9\uff0c\u505a\u5230\u5feb\u901f\u5165\u95e8\uff1a<\/p>\n\n\n\n<ol 
class=\"wp-block-list\"><li>GPU\u67b6\u6784\u7279\u70b9<\/li><li>CUDA\u7ebf\u7a0b\u6a21\u578b<\/li><li>CUDA\u5185\u5b58\u6a21\u578b<\/li><li>CUDA\u7f16\u7a0b\u6a21\u578b<\/li><li>CUDA\u5e94\u7528\u5c0f\u4f8b\u5b50<\/li><\/ol>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"1-gpu\u67b6\u6784\u7279\u70b9\">1. GPU\u67b6\u6784\u7279\u70b9<\/h2>\n\n\n\n<p>\u9996\u5148\u6211\u4eec\u5148\u8c08\u4e00\u8c08\u4e32\u884c\u8ba1\u7b97\u548c\u5e76\u884c\u8ba1\u7b97\u3002\u6211\u4eec\u77e5\u9053\uff0c\u9ad8\u6027\u80fd\u8ba1\u7b97\u7684\u5173\u952e\u5229\u7528\u591a\u6838\u5904\u7406\u5668\u8fdb\u884c\u5e76\u884c\u8ba1\u7b97\u3002<\/p>\n\n\n\n<p>\u5f53\u6211\u4eec\u6c42\u89e3\u4e00\u4e2a\u8ba1\u7b97\u673a\u7a0b\u5e8f\u4efb\u52a1\u65f6\uff0c\u6211\u4eec\u5f88\u81ea\u7136\u7684\u60f3\u6cd5\u5c31\u662f\u5c06\u8be5\u4efb\u52a1\u5206\u89e3\u6210\u4e00\u7cfb\u5217\u5c0f\u4efb\u52a1\uff0c\u628a\u8fd9\u4e9b\u5c0f\u4efb\u52a1\u4e00\u4e00\u5b8c\u6210\u3002\u5728\u4e32\u884c\u8ba1\u7b97\u65f6\uff0c\u6211\u4eec\u7684\u60f3\u6cd5\u5c31\u662f\u8ba9\u6211\u4eec\u7684\u5904\u7406\u5668\u6bcf\u6b21\u5904\u7406\u4e00\u4e2a\u8ba1\u7b97\u4efb\u52a1\uff0c\u5904\u7406\u5b8c\u4e00\u4e2a\u8ba1\u7b97\u4efb\u52a1\u540e\u518d\u8ba1\u7b97\u4e0b\u4e00\u4e2a\u4efb\u52a1\uff0c\u76f4\u5230\u6240\u6709\u5c0f\u4efb\u52a1\u90fd\u5b8c\u6210\u4e86\uff0c\u90a3\u4e48\u8fd9\u4e2a\u5927\u7684\u7a0b\u5e8f\u4efb\u52a1\u4e5f\u5c31\u5b8c\u6210\u4e86\u3002<\/p>\n\n\n\n<p>\u4f46\u662f\u4e32\u884c\u8ba1\u7b97\u7684\u7f3a\u70b9\u975e\u5e38\u660e\u663e\uff0c\u5982\u679c\u6211\u4eec\u62e5\u6709\u591a\u6838\u5904\u7406\u5668\uff0c\u6211\u4eec\u53ef\u4ee5\u5229\u7528\u591a\u6838\u5904\u7406\u5668\u540c\u65f6\u5904\u7406\u591a\u4e2a\u4efb\u52a1\u65f6\uff0c\u800c\u4e14\u8fd9\u4e9b\u5c0f\u4efb\u52a1\u5e76\u6ca1\u6709\u5173\u8054\u5173\u7cfb\uff08\u4e0d\u9700\u8981\u76f8\u4e92\u4f9d\u8d56\uff0c\u6bd4\u5982\u6211\u7684\u8ba1\u7b97\u4efb\u52a1\u4e0d\u9700\u8981\u7528\u5230\u4f60\u7684\u8ba1\u7b97\u7ed3\u679c\uff09\uff0c\u90a3\u6211\u4eec\u4e3a
\u4ec0\u4e48\u8fd8\u8981\u4f7f\u7528\u4e32\u884c\u7f16\u7a0b\u5462\uff1f\u4e3a\u4e86\u8fdb\u4e00\u6b65\u52a0\u5feb\u5927\u4efb\u52a1\u7684\u8ba1\u7b97\u901f\u5ea6\uff0c\u6211\u4eec\u53ef\u4ee5\u628a\u4e00\u4e9b\u72ec\u7acb\u7684\u6a21\u5757\u5206\u914d\u5230\u4e0d\u540c\u7684\u5904\u7406\u5668\u4e0a\u8fdb\u884c\u540c\u65f6\u8ba1\u7b97\uff08\u8fd9\u5c31\u662f\u5e76\u884c\uff09\uff0c\u6700\u540e\u518d\u5c06\u8fd9\u4e9b\u7ed3\u679c\u8fdb\u884c\u6574\u5408\uff0c\u5b8c\u6210\u4e00\u6b21\u4efb\u52a1\u8ba1\u7b97\u3002\u4e0b\u56fe\u5c31\u662f\u5c06\u4e00\u4e2a\u5927\u7684\u8ba1\u7b97\u4efb\u52a1\u5206\u89e3\u4e3a\u5c0f\u4efb\u52a1\uff0c\u7136\u540e\u5c06\u72ec\u7acb\u7684\u5c0f\u4efb\u52a1\u5206\u914d\u5230\u4e0d\u540c\u5904\u7406\u5668\u8fdb\u884c\u5e76\u884c\u8ba1\u7b97\uff0c\u6700\u540e\u518d\u901a\u8fc7\u4e32\u884c\u7a0b\u5e8f\u628a\u7ed3\u679c\u6c47\u603b\u5b8c\u6210\u8fd9\u6b21\u7684\u603b\u7684\u8ba1\u7b97\u4efb\u52a1\u3002<\/p>\n\n\n\n<p>\u6240\u4ee5\uff0c\u4e00\u4e2a\u7a0b\u5e8f\u53ef\u4e0d\u53ef\u4ee5\u8fdb\u884c\u5e76\u884c\u8ba1\u7b97\uff0c\u5173\u952e\u5c31\u5728\u4e8e\u6211\u4eec\u8981\u5206\u6790\u51fa\u8be5\u7a0b\u5e8f\u53ef\u4ee5\u62c6\u5206\u51fa\u54ea\u51e0\u4e2a\u6267\u884c\u6a21\u5757\uff0c\u8fd9\u4e9b\u6267\u884c\u6a21\u5757\u54ea\u4e9b\u662f\u72ec\u7acb\u7684\uff0c\u54ea\u4e9b\u53c8\u662f\u5f3a\u4f9d\u8d56\u5f3a\u8026\u5408\u7684\uff0c\u72ec\u7acb\u7684\u6a21\u5757\u6211\u4eec\u53ef\u4ee5\u8bd5\u7740\u8bbe\u8ba1\u5e76\u884c\u8ba1\u7b97\uff0c\u5145\u5206\u5229\u7528\u591a\u6838\u5904\u7406\u5668\u7684\u4f18\u52bf\u8fdb\u4e00\u6b65\u52a0\u901f\u6211\u4eec\u7684\u8ba1\u7b97\u4efb\u52a1\uff0c\u5f3a\u8026\u5408\u6a21\u5757\u6211\u4eec\u5c31\u4f7f\u7528\u4e32\u884c\u7f16\u7a0b\uff0c\u5229\u7528\u4e32\u884c+\u5e76\u884c\u7684\u7f16\u7a0b\u601d\u8def\u5b8c\u6210\u4e00\u6b21\u9ad8\u6027\u80fd\u8ba1\u7b97\u3002<\/p>\n\n\n\n<p>\u63a5\u4e0b\u6765\u6211\u4eec\u8c08\u8c08CPU\u548cGPU\u6709\u4ec0\u4e48\u533a\u522b\uff0c\u4ed6\u4eec\u4fe9\u5404\u81ea\u6709\u4ec0\u4e4
8\u7279\u70b9\uff0c\u6211\u4eec\u5728\u8c08\u5e76\u884c\u3001\u4e32\u884c\u8ba1\u7b97\u65f6\u591a\u6b21\u8c08\u5230\u201c\u591a\u6838\u201d\u7684\u6982\u5ff5\uff0c\u73b0\u5728\u6211\u4eec\u5148\u4ece\u201c\u6838\u201d\u7684\u89d2\u5ea6\u5f00\u59cb\u8fd9\u4e2a\u8bdd\u9898\u3002\u9996\u5148CPU\u662f\u4e13\u4e3a\u987a\u5e8f\u4e32\u884c\u5904\u7406\u800c\u4f18\u5316\u7684\u51e0\u4e2a\u6838\u5fc3\u7ec4\u6210\u3002\u800cGPU\u5219\u7531\u6570\u4ee5\u5343\u8ba1\u7684\u66f4\u5c0f\u3001\u66f4\u9ad8\u6548\u7684\u6838\u5fc3\u7ec4\u6210\uff0c\u8fd9\u4e9b\u6838\u5fc3\u4e13\u95e8\u4e3a\u540c\u65f6\u5904\u7406\u591a\u4efb\u52a1\u800c\u8bbe\u8ba1\uff0c\u53ef\u9ad8\u6548\u5730\u5904\u7406\u5e76\u884c\u4efb\u52a1\u3002\u4e5f\u5c31\u662f\uff0cCPU\u867d\u7136\u6bcf\u4e2a\u6838\u5fc3\u81ea\u8eab\u80fd\u529b\u6781\u5f3a\uff0c\u5904\u7406\u4efb\u52a1\u4e0a\u975e\u5e38\u5f3a\u608d\uff0c\u65e0\u5948\u4ed6\u6838\u5fc3\u5c11\uff0c\u5728\u5e76\u884c\u8ba1\u7b97\u4e0a\u8868\u73b0\u4e0d\u4f73\uff1b\u53cd\u89c2GPU\uff0c\u867d\u7136\u4ed6\u7684\u6bcf\u4e2a\u6838\u5fc3\u7684\u8ba1\u7b97\u80fd\u529b\u4e0d\u7b97\u5f3a\uff0c\u4f46\u4ed6\u80dc\u5728\u6838\u5fc3\u975e\u5e38\u591a\uff0c\u53ef\u4ee5\u540c\u65f6\u5904\u7406\u591a\u4e2a\u8ba1\u7b97\u4efb\u52a1\uff0c\u5728\u5e76\u884c\u8ba1\u7b97\u7684\u652f\u6301\u4e0a\u505a\u5f97\u5f88\u597d\u3002<\/p>\n\n\n\n<p>GPU\u548cCPU\u7684\u4e0d\u540c\u786c\u4ef6\u7279\u70b9\u51b3\u5b9a\u4e86\u4ed6\u4eec\u7684\u5e94\u7528\u573a\u666f\uff0cCPU\u662f\u8ba1\u7b97\u673a\u7684\u8fd0\u7b97\u548c\u63a7\u5236\u7684\u6838\u5fc3\uff0cGPU\u4e3b\u8981\u7528\u4f5c\u56fe\u5f62\u56fe\u50cf\u5904\u7406\u3002\u56fe\u50cf\u5728\u8ba1\u7b97\u673a\u5448\u73b0\u7684\u5f62\u5f0f\u5c31\u662f\u77e9\u9635\uff0c\u6211\u4eec\u5bf9\u56fe\u50cf\u7684\u5904\u7406\u5176\u5b9e\u5c31\u662f\u64cd\u4f5c\u5404\u79cd\u77e9\u9635\u8fdb\u884c\u8ba1\u7b97\uff0c\u800c\u5f88\u591a\u77e9\u9635\u7684\u8fd0\u7b97\u5176\u5b9e\u53ef\u4ee5\u505a\u5e76\u884c\u5316\uff0c\u8fd9\u4f7f\u5f97\u56fe\u50cf\u5904\u7406\u5
3ef\u4ee5\u505a\u5f97\u5f88\u5feb\uff0c\u56e0\u6b64GPU\u5728\u56fe\u5f62\u56fe\u50cf\u9886\u57df\u4e5f\u6709\u4e86\u5927\u5c55\u62f3\u811a\u7684\u673a\u4f1a\u3002\u4e0b\u56fe\u8868\u793a\u7684\u5c31\u662f\u4e00\u4e2a\u591aGPU\u8ba1\u7b97\u673a\u786c\u4ef6\u7cfb\u7edf\uff0c\u53ef\u4ee5\u770b\u51fa\uff0c\u4e00\u4e2aGPU\u5185\u5b58\u5c31\u6709\u5f88\u591a\u4e2aSP\u548c\u5404\u7c7b\u5185\u5b58\uff0c\u8fd9\u4e9b\u786c\u4ef6\u90fd\u662fGPU\u8fdb\u884c\u9ad8\u6548\u5e76\u884c\u8ba1\u7b97\u7684\u57fa\u7840\u3002<\/p>\n\n\n\n<p>\u73b0\u5728\u518d\u4ece\u6570\u636e\u5904\u7406\u7684\u89d2\u5ea6\u6765\u5bf9\u6bd4CPU\u548cGPU\u7684\u7279\u70b9\u3002CPU\u9700\u8981\u5f88\u5f3a\u7684\u901a\u7528\u6027\u6765\u5904\u7406\u5404\u79cd\u4e0d\u540c\u7684\u6570\u636e\u7c7b\u578b\uff0c\u6bd4\u5982\u6574\u578b\u3001\u6d6e\u70b9\u6570\u7b49\uff0c\u540c\u65f6\u5b83\u53c8\u5fc5\u987b\u64c5\u957f\u5904\u7406\u903b\u8f91\u5224\u65ad\u6240\u5bfc\u81f4\u7684\u5927\u91cf\u5206\u652f\u8df3\u8f6c\u548c\u4e2d\u65ad\u5904\u7406\uff0c\u6240\u4ee5CPU\u5176\u5b9e\u5c31\u662f\u4e00\u4e2a\u80fd\u529b\u5f88\u5f3a\u7684\u4f19\u8ba1\uff0c\u4ed6\u80fd\u628a\u5f88\u591a\u4e8b\u5904\u7406\u5f97\u59a5\u59a5\u5f53\u5f53\uff0c\u5f53\u7136\u5566\u6211\u4eec\u9700\u8981\u7ed9\u4ed6\u5f88\u591a\u8d44\u6e90\u4f9b\u4ed6\u4f7f\u7528\uff08\u5404\u79cd\u786c\u4ef6\uff09\uff0c\u8fd9\u4e5f\u5bfc\u81f4\u4e86CPU\u4e0d\u53ef\u80fd\u6709\u592a\u591a\u6838\u5fc3\uff08\u6838\u5fc3\u603b\u6570\u4e0d\u8d85\u8fc716\uff09\u3002\u800cGPU\u9762\u5bf9\u7684\u5219\u662f\u7c7b\u578b\u9ad8\u5ea6\u7edf\u4e00\u7684\u3001\u76f8\u4e92\u65e0\u4f9d\u8d56\u7684\u5927\u89c4\u6a21\u6570\u636e\u548c\u4e0d\u9700\u8981\u88ab\u6253\u65ad\u7684\u7eaf\u51c0\u7684\u8ba1\u7b97\u73af\u5883\uff0cGPU\u6709\u975e\u5e38\u591a\u6838\u5fc3\uff08\u8d39\u7c73\u67b6\u6784\u5c31\u6709512\u6838\uff09\uff0c\u867d\u7136\u5176\u6838\u5fc3\u7684\u80fd\u529b\u8fdc\u6ca1\u6709CPU\u7684\u6838\u5fc3\u5f3a\uff0c\u4f46\u662f\u80dc\u5728\u591a\uff0c<br>\u5728\u5904\u7406\u7b80\u
5355\u8ba1\u7b97\u4efb\u52a1\u65f6\u5448\u73b0\u51fa\u201c\u4eba\u591a\u529b\u91cf\u5927\u201d\u7684\u4f18\u52bf\uff0c\u8fd9\u5c31\u662f\u5e76\u884c\u8ba1\u7b97\u7684\u9b45\u529b\u3002<\/p>\n\n\n\n<p>\u6574\u7406\u4e00\u4e0b\u4e24\u8005\u7279\u70b9\u5c31\u662f\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\"><li>CPU\uff1a\u64c5\u957f\u6d41\u7a0b\u63a7\u5236\u548c\u903b\u8f91\u5904\u7406\uff0c\u4e0d\u89c4\u5219\u6570\u636e\u7ed3\u6784\uff0c\u4e0d\u53ef\u9884\u6d4b\u5b58\u50a8\u7ed3\u6784\uff0c\u5355\u7ebf\u7a0b\u7a0b\u5e8f\uff0c\u5206\u652f\u5bc6\u96c6\u578b\u7b97\u6cd5<\/li><li>GPU\uff1a\u64c5\u957f\u6570\u636e\u5e76\u884c\u8ba1\u7b97\uff0c\u89c4\u5219\u6570\u636e\u7ed3\u6784\uff0c\u53ef\u9884\u6d4b\u5b58\u50a8\u6a21\u5f0f<\/li><\/ul>\n\n\n\n<p>\u73b0\u5728\u7684\u8ba1\u7b97\u673a\u4f53\u7cfb\u67b6\u6784\u4e2d\uff0c\u8981\u5b8c\u6210CUDA\u5e76\u884c\u8ba1\u7b97\uff0c\u5355\u9760GPU\u4e00\u4eba\u4e4b\u529b\u662f\u4e0d\u80fd\u5b8c\u6210\u8ba1\u7b97\u4efb\u52a1\u7684\uff0c\u5fc5\u987b\u501f\u52a9CPU\u6765\u534f\u540c\u914d\u5408\u5b8c\u6210\u4e00\u6b21\u9ad8\u6027\u80fd\u7684\u5e76\u884c\u8ba1\u7b97\u4efb\u52a1\u3002<\/p>\n\n\n\n<p>\u4e00\u822c\u800c\u8a00\uff0c\u5e76\u884c\u90e8\u5206\u5728GPU\u4e0a\u8fd0\u884c\uff0c\u4e32\u884c\u90e8\u5206\u5728CPU\u8fd0\u884c\uff0c\u8fd9\u5c31\u662f\u5f02\u6784\u8ba1\u7b97\u3002\u5177\u4f53\u4e00\u70b9\uff0c\u5f02\u6784\u8ba1\u7b97\u7684\u610f\u601d\u5c31\u662f\u4e0d\u540c\u4f53\u7cfb\u7ed3\u6784\u7684\u5904\u7406\u5668\u76f8\u4e92\u534f\u4f5c\u5b8c\u6210\u8ba1\u7b97\u4efb\u52a1\u3002CPU\u8d1f\u8d23\u603b\u4f53\u7684\u7a0b\u5e8f\u6d41\u7a0b\uff0c\u800cGPU\u8d1f\u8d23\u5177\u4f53\u7684\u8ba1\u7b97\u4efb\u52a1\uff0c\u5f53GPU\u5404\u4e2a\u7ebf\u7a0b\u5b8c\u6210\u8ba1\u7b97\u4efb\u52a1\u540e\uff0c\u6211\u4eec\u5c31\u5c06GPU\u90a3\u8fb9\u8ba1\u7b97\u5f97\u5230\u7684\u7ed3\u679c\u62f7\u8d1d\u5230CPU\u7aef\uff0c\u5b8c\u6210\u4e00\u6b21\u8ba1\u7b97\u4efb\u52a1\u3002<\/p>\n\n\n\n<p>\u6240\u4ee5\u5e94\u7528\u7a0b\u5e8f\u5229\u7528GPU\u5b9e\u73b0
\u52a0\u901f\u7684\u603b\u4f53\u5206\u5de5\u5c31\u662f\uff1a\u5bc6\u96c6\u8ba1\u7b97\u4ee3\u7801\uff08\u7ea6\u53605%\u7684\u4ee3\u7801\u91cf\uff09\u7531GPU\u8d1f\u8d23\u5b8c\u6210\uff0c\u5269\u4f59\u4e32\u884c\u4ee3\u7801\u7531CPU\u8d1f\u8d23\u6267\u884c\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"2-cuda\u7ebf\u7a0b\u6a21\u578b\">2. CUDA\u7ebf\u7a0b\u6a21\u578b<\/h2>\n\n\n\n<p>\u4e0b\u9762\u6211\u4eec\u4ecb\u7ecdCUDA\u7684\u7ebf\u7a0b\u7ec4\u7ec7\u7ed3\u6784\u3002\u9996\u5148\u6211\u4eec\u90fd\u77e5\u9053\uff0c\u7ebf\u7a0b\u662f\u7a0b\u5e8f\u6267\u884c\u7684\u6700\u57fa\u672c\u5355\u5143\uff0cCUDA\u7684\u5e76\u884c\u8ba1\u7b97\u5c31\u662f\u901a\u8fc7\u6210\u5343\u4e0a\u4e07\u4e2a\u7ebf\u7a0b\u7684\u5e76\u884c\u6267\u884c\u6765\u5b9e\u73b0\u7684\u3002\u4e0b\u9762\u7684\u673a\u6784\u56fe\u8bf4\u660e\u4e86GPU\u7684\u4e0d\u540c\u5c42\u6b21\u7684\u7ed3\u6784\u3002<\/p>\n\n\n\n<p>CUDA\u7684\u7ebf\u7a0b\u6a21\u578b\u4ece\u5c0f\u5f80\u5927\u6765\u603b\u7ed3\u5c31\u662f\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\"><li>Thread\uff1a\u7ebf\u7a0b\uff0c\u5e76\u884c\u7684\u57fa\u672c\u5355\u4f4d<\/li><li>Thread Block\uff1a\u7ebf\u7a0b\u5757\uff0c\u4e92\u76f8\u5408\u4f5c\u7684\u7ebf\u7a0b\u7ec4\uff0c\u7ebf\u7a0b\u5757\u6709\u5982\u4e0b\u51e0\u4e2a\u7279\u70b9\uff1a<\/li><\/ol>\n\n\n\n<ul class=\"wp-block-list\"><li>\u5141\u8bb8\u5f7c\u6b64\u540c\u6b65<\/li><li>\u53ef\u4ee5\u901a\u8fc7\u5171\u4eab\u5185\u5b58\u5feb\u901f\u4ea4\u6362\u6570\u636e<\/li><li>\u4ee51\u7ef4\u30012\u7ef4\u62163\u7ef4\u7ec4\u7ec7<\/li><\/ul>\n\n\n\n<ol class=\"wp-block-list\" start=\"3\"><li>Grid\uff1a\u4e00\u7ec4\u7ebf\u7a0b\u5757<\/li><\/ol>\n\n\n\n<ul class=\"wp-block-list\"><li>\u4ee51\u7ef4\u30012\u7ef4\u7ec4\u7ec7<\/li><li>\u5171\u4eab\u5168\u5c40\u5185\u5b58<\/li><\/ul>\n\n\n\n<p>Kernel\uff1a\u5728GPU\u4e0a\u6267\u884c\u7684\u6838\u5fc3\u7a0b\u5e8f\uff0c\u8fd9\u4e2akernel\u51fd\u6570\u662f\u8fd0\u884c\u5728\u67d0\u4e2aGrid\u4e0a\u7684\u3002<\/p>\n\n\n\n<ul class=\"wp-block-list\"><li>One 
kernel &lt;-&gt; One Grid<\/li><\/ul>\n\n\n\n<p>\u6bcf\u4e00\u4e2ablock\u548c\u6bcf\u4e2athread\u90fd\u6709\u81ea\u5df1\u7684ID\uff0c\u6211\u4eec\u901a\u8fc7\u76f8\u5e94\u7684\u7d22\u5f15\u627e\u5230\u76f8\u5e94\u7684\u7ebf\u7a0b\u548c\u7ebf\u7a0b\u5757\u3002<\/p>\n\n\n\n<ul class=\"wp-block-list\"><li>threadIdx\uff0cblockIdx<\/li><li>Block ID: 1D or 2D<\/li><li>Thread ID: 1D, 2D or 3D<\/li><\/ul>\n\n\n\n<p>\u7406\u89e3kernel\uff0c\u5fc5\u987b\u8981\u5bf9kernel\u7684\u7ebf\u7a0b\u5c42\u6b21\u7ed3\u6784\u6709\u4e00\u4e2a\u6e05\u6670\u7684\u8ba4\u8bc6\u3002\u9996\u5148GPU\u4e0a\u5f88\u591a\u5e76\u884c\u5316\u7684\u8f7b\u91cf\u7ea7\u7ebf\u7a0b\u3002kernel\u5728device\u4e0a\u6267\u884c\u65f6\u5b9e\u9645\u4e0a\u662f\u542f\u52a8\u5f88\u591a\u7ebf\u7a0b\uff0c\u4e00\u4e2akernel\u6240\u542f\u52a8\u7684\u6240\u6709\u7ebf\u7a0b\u79f0\u4e3a\u4e00\u4e2a\u7f51\u683c\uff08grid\uff09\uff0c\u540c\u4e00\u4e2a\u7f51\u683c\u4e0a\u7684\u7ebf\u7a0b\u5171\u4eab\u76f8\u540c\u7684\u5168\u5c40\u5185\u5b58\u7a7a\u95f4\uff0cgrid\u662f\u7ebf\u7a0b\u7ed3\u6784\u7684\u7b2c\u4e00\u5c42\u6b21\uff0c\u800c\u7f51\u683c\u53c8\u53ef\u4ee5\u5206\u4e3a\u5f88\u591a\u7ebf\u7a0b\u5757\uff08block\uff09\uff0c\u4e00\u4e2a\u7ebf\u7a0b\u5757\u91cc\u9762\u5305\u542b\u5f88\u591a\u7ebf\u7a0b\uff0c\u8fd9\u662f\u7b2c\u4e8c\u4e2a\u5c42\u6b21\u3002\u7ebf\u7a0b\u4e24\u5c42\u7ec4\u7ec7\u7ed3\u6784\u5982\u4e0a\u56fe\u6240\u793a\uff0c\u8fd9\u662f\u4e00\u4e2agird\u548cblock\u5747\u4e3a2-dim\u7684\u7ebf\u7a0b\u7ec4\u7ec7\u3002grid\u548cblock\u90fd\u662f\u5b9a\u4e49\u4e3adim3\u7c7b\u578b\u7684\u53d8\u91cf\uff0cdim3\u53ef\u4ee5\u770b\u6210\u662f\u5305\u542b\u4e09\u4e2a\u65e0\u7b26\u53f7\u6574\u6570\uff08x\uff0cy\uff0cz\uff09\u6210\u5458\u7684\u7ed3\u6784\u4f53\u53d8\u91cf\uff0c\u5728\u5b9a\u4e49\u65f6\uff0c\u7f3a\u7701\u503c\u521d\u59cb\u5316\u4e3a1\u3002\u56e0\u6b64grid\u548cblock\u53ef\u4ee5\u7075\u6d3b\u5730\u5b9a\u4e49\u4e3a1-dim\uff0c2-dim\u4ee5\u53ca3-dim\u7ed3\u6784\uff0ckernel\u8c03\u7528\u65f6\u4e5f\u5fc5\u987b\u901a\u
8fc7\u6267\u884c\u914d\u7f6e&lt;&lt;&lt;grid, block&gt;&gt;&gt;\u6765\u6307\u5b9akernel\u6240\u4f7f\u7528\u7684\u7f51\u683c\u7ef4\u5ea6\u548c\u7ebf\u7a0b\u5757\u7ef4\u5ea6\u3002\u4e3e\u4e2a\u4f8b\u5b50\uff0c\u6211\u4eec\u4ee5\u4e0a\u56fe\u4e3a\u4f8b\uff0c\u5206\u6790\u600e\u4e48\u901a\u8fc7&lt;&lt;&lt;grid,block&gt;&gt;&gt;\u8fd9\u79cd\u6807\u8bb0\u65b9\u5f0f\u7d22\u5f15\u5230\u6211\u4eec\u60f3\u8981\u7684\u90a3\u4e2a\u7ebf\u7a0b\u3002CUDA\u7684\u8fd9\u79cd&lt;&lt;&lt;grid,block&gt;&gt;&gt;\u5176\u5b9e\u5c31\u662f\u4e00\u4e2a\u591a\u7ea7\u7d22\u5f15\u7684\u65b9\u6cd5\uff0c\u7b2c\u4e00\u7ea7\u7d22\u5f15\u662f(grid.xIdx, grid.yIdx)\uff0c\u5bf9\u5e94\u4e0a\u56fe\u4f8b\u5b50\u5c31\u662f(1, 1)\uff0c\u901a\u8fc7\u5b83\u6211\u4eec\u5c31\u80fd\u627e\u5230\u4e86\u8fd9\u4e2a\u7ebf\u7a0b\u5757\u7684\u4f4d\u7f6e\uff0c\u7136\u540e\u6211\u4eec\u542f\u52a8\u4e8c\u7ea7\u7d22\u5f15(block.xIdx, block.yIdx, block.zIdx)\u6765\u5b9a\u4f4d\u5230\u6307\u5b9a\u7684\u7ebf\u7a0b\u3002\u8fd9\u5c31\u662f\u6211\u4eecCUDA\u7684\u7ebf\u7a0b\u7ec4\u7ec7\u7ed3\u6784\u3002<\/p>\n\n\n\n<p>\u8fd9\u91cc\u60f3\u8c08\u8c08SP\u548cSM\uff08\u6d41\u5904\u7406\u5668\uff09\uff0c\u5f88\u591a\u4eba\u4f1a\u88ab\u8fd9\u4e24\u4e2a\u4e13\u4e1a\u540d\u8bcd\u641e\u5f97\u6655\u5934\u8f6c\u5411\u3002<\/p>\n\n\n\n<ul class=\"wp-block-list\"><li>SP\uff1a\u6700\u57fa\u672c\u7684\u5904\u7406\u5355\u5143\uff0cstreaming processor\uff0c\u4e5f\u79f0\u4e3aCUDA core\u3002\u6700\u540e\u5177\u4f53\u7684\u6307\u4ee4\u548c\u4efb\u52a1\u90fd\u662f\u5728SP\u4e0a\u5904\u7406\u7684\u3002GPU\u8fdb\u884c\u5e76\u884c\u8ba1\u7b97\uff0c\u4e5f\u5c31\u662f\u5f88\u591a\u4e2aSP\u540c\u65f6\u505a\u5904\u7406\u3002<\/li><li>SM\uff1a\u591a\u4e2aSP\u52a0\u4e0a\u5176\u4ed6\u7684\u4e00\u4e9b\u8d44\u6e90\u7ec4\u6210\u4e00\u4e2astreaming multiprocessor\u3002\u4e5f\u53ebGPU\u5927\u6838\uff0c\u5176\u4ed6\u8d44\u6e90\u5982\uff1awarp scheduler\uff0cregister\uff0cshared 
memory\u7b49\u3002SM\u53ef\u4ee5\u770b\u505aGPU\u7684\u5fc3\u810f\uff08\u5bf9\u6bd4CPU\u6838\u5fc3\uff09\uff0cregister\u548cshared memory\u662fSM\u7684\u7a00\u7f3a\u8d44\u6e90\u3002CUDA\u5c06\u8fd9\u4e9b\u8d44\u6e90\u5206\u914d\u7ed9\u6240\u6709\u9a7b\u7559\u5728SM\u4e2d\u7684threads\u3002\u56e0\u6b64\uff0c\u8fd9\u4e9b\u6709\u9650\u7684\u8d44\u6e90\u5c31\u4f7f\u6bcf\u4e2aSM\u4e2dactive warps\u6709\u975e\u5e38\u4e25\u683c\u7684\u9650\u5236\uff0c\u4e5f\u5c31\u9650\u5236\u4e86\u5e76\u884c\u80fd\u529b\u3002<\/li><\/ul>\n\n\n\n<p>\u9700\u8981\u6307\u51fa\uff0c\u6bcf\u4e2aSM\u5305\u542b\u7684SP\u6570\u91cf\u4f9d\u636eGPU\u67b6\u6784\u800c\u4e0d\u540c\uff0cFermi\u67b6\u6784GF100\u662f32\u4e2a\uff0cGF10X\u662f48\u4e2a\uff0cKepler\u67b6\u6784\u90fd\u662f192\u4e2a\uff0cMaxwell\u90fd\u662f128\u4e2a\u3002<\/p>\n\n\n\n<p>\u7b80\u800c\u8a00\u4e4b\uff0cSP\u662f\u7ebf\u7a0b\u6267\u884c\u7684\u786c\u4ef6\u5355\u4f4d\uff0cSM\u4e2d\u5305\u542b\u591a\u4e2aSP\uff0c\u4e00\u4e2aGPU\u53ef\u4ee5\u6709\u591a\u4e2aSM\uff08\u6bd4\u598216\u4e2a\uff09\uff0c\u6700\u7ec8\u4e00\u4e2aGPU\u53ef\u80fd\u5305\u542b\u6709\u4e0a\u5343\u4e2aSP\u3002\u8fd9\u4e48\u591a\u6838\u5fc3\u201c\u540c\u65f6\u8fd0\u884c\u201d\uff0c\u901f\u5ea6\u53ef\u60f3\u800c\u77e5\uff0c\u8fd9\u4e2a\u5f15\u53f7\u53ea\u662f\u60f3\u8868\u660e\u5b9e\u9645\u4e0a\uff0c\u8f6f\u4ef6\u903b\u8f91\u4e0a\u662f\u6240\u6709SP\u662f\u5e76\u884c\u7684\uff0c\u4f46\u662f\u7269\u7406\u4e0a\u5e76\u4e0d\u662f\u6240\u6709SP\u90fd\u80fd\u540c\u65f6\u6267\u884c\u8ba1\u7b97\uff08\u6bd4\u5982\u6211\u4eec\u53ea\u67098\u4e2aSM\u5374\u67091024\u4e2a\u7ebf\u7a0b\u5757\u9700\u8981\u8c03\u5ea6\u5904\u7406\uff09\uff0c\u56e0\u4e3a\u6709\u4e9b\u4f1a\u5904\u4e8e\u6302\u8d77\uff0c\u5c31\u7eea\u7b49\u5176\u4ed6\u72b6\u6001\uff0c\u8fd9\u6709\u5173GPU\u7684\u7ebf\u7a0b\u8c03\u5ea6\u3002<\/p>\n\n\n\n<p>\u4e0b\u9762\u8fd9\u4e2a\u56fe\u5c06\u4ece\u786c\u4ef6\u89d2\u5ea6\u548c\u8f6f\u4ef6\u89d2\u5ea6\u89e3\u91caCUDA\u7684\u7ebf\u7a0b\u6a21\u578b\u3002<\/p>\n\n\n\n<ul 
class=\"wp-block-list\"><li>\u6bcf\u4e2a\u7ebf\u7a0b\u7531\u6bcf\u4e2a\u7ebf\u7a0b\u5904\u7406\u5668\uff08SP\uff09\u6267\u884c<\/li><li>\u7ebf\u7a0b\u5757\u7531\u591a\u6838\u5904\u7406\u5668\uff08SM\uff09\u6267\u884c<\/li><li>\u4e00\u4e2akernel\u5176\u5b9e\u7531\u4e00\u4e2agrid\u6765\u6267\u884c\uff0c\u4e00\u4e2akernel\u4e00\u6b21\u53ea\u80fd\u5728\u4e00\u4e2aGPU\u4e0a\u6267\u884c<\/li><\/ul>\n\n\n\n<p>block\u662f\u8f6f\u4ef6\u6982\u5ff5\uff0c\u4e00\u4e2ablock\u53ea\u4f1a\u7531\u4e00\u4e2asm\u8c03\u5ea6\uff0c\u7a0b\u5e8f\u5458\u5728\u5f00\u53d1\u65f6\uff0c\u901a\u8fc7\u8bbe\u5b9ablock\u7684\u5c5e\u6027\uff0c\u544a\u8bc9GPU\u786c\u4ef6\uff0c\u6211\u6709\u591a\u5c11\u4e2a\u7ebf\u7a0b\uff0c\u7ebf\u7a0b\u600e\u4e48\u7ec4\u7ec7\u3002\u800c\u5177\u4f53\u600e\u4e48\u8c03\u5ea6\u7531sm\u7684warps scheduler\u8d1f\u8d23\uff0cblock\u4e00\u65e6\u88ab\u5206\u914d\u597dSM\uff0c\u8be5block\u5c31\u4f1a\u4e00\u76f4\u9a7b\u7559\u5728\u8be5SM\u4e2d\uff0c\u76f4\u5230\u6267\u884c\u7ed3\u675f\u3002\u4e00\u4e2aSM\u53ef\u4ee5\u540c\u65f6\u62e5\u6709\u591a\u4e2ablocks\uff0c\u4f46\u9700\u8981\u5e8f\u5217\u6267\u884c\u3002\u4e0b\u56fe\u663e\u793a\u4e86GPU\u5185\u90e8\u7684\u786c\u4ef6\u67b6\u6784\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/img2018.cnblogs.com\/blog\/1093303\/201809\/1093303-20180919123048002-1383369419.png\" alt=\"\"\/><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"3-cuda\u5185\u5b58\u6a21\u578b\">3. 
CUDA\u5185\u5b58\u6a21\u578b<\/h2>\n\n\n\n<p>CUDA\u4e2d\u7684\u5185\u5b58\u6a21\u578b\u5206\u4e3a\u4ee5\u4e0b\u51e0\u4e2a\u5c42\u6b21\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\"><li>\u6bcf\u4e2a\u7ebf\u7a0b\u90fd\u6709\u81ea\u5df1\u7684registers\uff08\u5bc4\u5b58\u5668\uff09<\/li><li>\u6bcf\u4e2a\u7ebf\u7a0b\u90fd\u6709\u81ea\u5df1\u7684local memory\uff08\u5c40\u90e8\u5185\u5b58\uff09<\/li><li>\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u5185\u90fd\u6709\u81ea\u5df1\u7684shared memory\uff08\u5171\u4eab\u5185\u5b58\uff09\uff0c\u8be5\u7ebf\u7a0b\u5757\u5185\u7684\u6240\u6709\u7ebf\u7a0b\u5171\u4eab\u8fd9\u6bb5\u5185\u5b58\u8d44\u6e90<\/li><li>\u6bcf\u4e2agrid\u90fd\u6709\u81ea\u5df1\u7684global memory\uff08\u5168\u5c40\u5185\u5b58\uff09\uff0c\u4e0d\u540c\u7ebf\u7a0b\u5757\u7684\u7ebf\u7a0b\u90fd\u53ef\u4f7f\u7528<\/li><li>\u6bcf\u4e2agrid\u90fd\u6709\u81ea\u5df1\u7684constant memory\uff08\u5e38\u91cf\u5185\u5b58\uff09\u548ctexture memory\uff08\u7eb9\u7406\u5185\u5b58\uff09\uff0c\u4e0d\u540c\u7ebf\u7a0b\u5757\u7684\u7ebf\u7a0b\u90fd\u53ef\u4f7f\u7528<\/li><\/ul>\n\n\n\n<p>\u7ebf\u7a0b\u8bbf\u95ee\u8fd9\u51e0\u7c7b\u5b58\u50a8\u5668\u7684\u901f\u5ea6\u662fregister &gt; local memory &gt; shared memory &gt; global memory<\/p>\n\n\n\n<p>\u4e0b\u9762\u8fd9\u5e45\u56fe\u8868\u793a\u5c31\u662f\u8fd9\u4e9b\u5185\u5b58\u5728\u8ba1\u7b97\u673a\u67b6\u6784\u4e2d\u7684\u6240\u5728\u5c42\u6b21\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"4-cuda\u7f16\u7a0b\u6a21\u578b\">4. 
CUDA\u7f16\u7a0b\u6a21\u578b<\/h2>\n\n\n\n<p>\u4e0a\u9762\u8bb2\u4e86\u8fd9\u4e48\u591a\u786c\u4ef6\u76f8\u5173\u7684\u77e5\u8bc6\u70b9\uff0c\u73b0\u5728\u7ec8\u4e8e\u53ef\u4ee5\u5f00\u59cb\u8bf4\u8bf4CUDA\u662f\u600e\u4e48\u5199\u7a0b\u5e8f\u7684\u4e86\u3002<\/p>\n\n\n\n<p>\u6211\u4eec\u5148\u634b\u4e00\u634b\u5e38\u89c1\u7684CUDA\u672f\u8bed\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/img2018.cnblogs.com\/blog\/1093303\/201809\/1093303-20180919123113524-1183131017.png\" alt=\"\"\/><\/figure>\n\n\n\n<p>\u7b2c\u4e00\u4e2a\u8981\u638c\u63e1\u7684\u7f16\u7a0b\u8981\u70b9\uff1a\u6211\u4eec\u600e\u4e48\u5199\u4e00\u4e2a\u80fd\u5728GPU\u8dd1\u7684\u7a0b\u5e8f\u6216\u51fd\u6570\u5462\uff1f<\/p>\n\n\n\n<p>\u901a\u8fc7\u5173\u952e\u5b57\u5c31\u53ef\u4ee5\u8868\u793a\u67d0\u4e2a\u7a0b\u5e8f\u5728CPU\u4e0a\u8dd1\u8fd8\u662f\u5728GPU\u4e0a\u8dd1\uff01\u5982\u4e0b\u8868\u6240\u793a\uff0c\u6bd4\u5982\u6211\u4eec\u7528__global__\u5b9a\u4e49\u4e00\u4e2akernel\u51fd\u6570\uff0c\u5c31\u662fCPU\u4e0a\u8c03\u7528\uff0cGPU\u4e0a\u6267\u884c\uff0c\u6ce8\u610f__global__\u51fd\u6570\u7684\u8fd4\u56de\u503c\u5fc5\u987b\u8bbe\u7f6e\u4e3avoid\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/img2018.cnblogs.com\/blog\/1093303\/201809\/1093303-20180919123125957-1702896390.png\" alt=\"\"\/><\/figure>\n\n\n\n<p>\u7b2c\u4e8c\u4e2a\u7f16\u7a0b\u8981\u70b9\uff1aCPU\u548cGPU\u95f4\u7684\u6570\u636e\u4f20\u8f93\u600e\u4e48\u5199\uff1f<\/p>\n\n\n\n<p>\u9996\u5148\u4ecb\u7ecd\u5728GPU\u5185\u5b58\u5206\u914d\u56de\u6536\u5185\u5b58\u7684\u51fd\u6570\u63a5\u53e3\uff1a<\/p>\n\n\n\n<ul class=\"wp-block-list\"><li>cudaMalloc(): \u5728\u8bbe\u5907\u7aef\u5206\u914dglobal memory<\/li><li>cudaFree(): 
\u91ca\u653e\u5b58\u50a8\u7a7a\u95f4<\/li><\/ul>\n\n\n\n<p>CPU\u7684\u6570\u636e\u548cGPU\u7aef\u6570\u636e\u505a\u6570\u636e\u4f20\u8f93\u7684\u51fd\u6570\u63a5\u53e3\u662f\u4e00\u6837\u7684\uff0c\u4ed6\u4eec\u901a\u8fc7\u4f20\u9012\u7684\u51fd\u6570\u5b9e\u53c2\uff08\u679a\u4e3e\u7c7b\u578b\uff09\u6765\u8868\u793a\u4f20\u8f93\u65b9\u5411\uff1a<\/p>\n\n\n\n<p>cudaMemcpy(void *dst, void *src, size_t nbytes,<br>enum cudaMemcpyKind direction)<\/p>\n\n\n\n<p>enum cudaMemcpyKind:<\/p>\n\n\n\n<ul class=\"wp-block-list\"><li>cudaMemcpyHostToDevice\uff08CPU\u5230GPU\uff09<\/li><li>cudaMemcpyDeviceToHost\uff08GPU\u5230CPU\uff09<\/li><li>cudaMemcpyDeviceToDevice\uff08GPU\u5230GPU\uff09<\/li><\/ul>\n\n\n\n<p>\u7b2c\u4e09\u4e2a\u7f16\u7a0b\u8981\u70b9\u662f\uff1a\u600e\u4e48\u7528\u4ee3\u7801\u8868\u793a\u7ebf\u7a0b\u7ec4\u7ec7\u6a21\u578b\uff1f<br>\u6211\u4eec\u53ef\u4ee5\u7528dim3\u7c7b\u6765\u8868\u793a\u7f51\u683c\u548c\u7ebf\u7a0b\u5757\u7684\u7ec4\u7ec7\u65b9\u5f0f\uff0c\u7f51\u683cgrid\u53ef\u4ee5\u8868\u793a\u4e3a\u4e00\u7ef4\u548c\u4e8c\u7ef4\u683c\u5f0f\uff0c\u7ebf\u7a0b\u5757block\u53ef\u4ee5\u8868\u793a\u4e3a\u4e00\u7ef4\u3001\u4e8c\u7ef4\u548c\u4e09\u7ef4\u7684\u6570\u636e\u683c\u5f0f\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>dim3 DimGrid(100, 50);  \/\/5000\u4e2a\u7ebf\u7a0b\u5757\uff0c\u7ef4\u5ea6\u662f100*50\ndim3 DimBlock(4, 8, 8);  \/\/\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u5185\u5305\u542b256\u4e2a\u7ebf\u7a0b\uff0c\u7ebf\u7a0b\u5757\u5185\u7684\u7ef4\u5ea6\u662f4*8*8\n\n<\/code><\/pre>\n\n\n\n<p>\u63a5\u4e0b\u6765\u4ecb\u7ecd\u4e00\u4e2a\u975e\u5e38\u91cd\u8981\u53c8\u5f88\u96be\u61c2\u7684\u4e00\u4e2a\u77e5\u8bc6\u70b9\uff0c\u6211\u4eec\u600e\u4e48\u8ba1\u7b97\u7ebf\u7a0b\u53f7\u5462\uff1f<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" 
id=\"1\u4f7f\u7528n\u4e2a\u7ebf\u7a0b\u5757\u6bcf\u4e00\u4e2a\u7ebf\u7a0b\u5757\u53ea\u6709\u4e00\u4e2a\u7ebf\u7a0b\u5373\">1.\u4f7f\u7528N\u4e2a\u7ebf\u7a0b\u5757\uff0c\u6bcf\u4e00\u4e2a\u7ebf\u7a0b\u5757\u53ea\u6709\u4e00\u4e2a\u7ebf\u7a0b\uff0c\u5373<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>dim3 dimGrid(N);\ndim3 dimBlock(1);\n\n<\/code><\/pre>\n\n\n\n<p>\u6b64\u65f6\u7684\u7ebf\u7a0b\u53f7\u7684\u8ba1\u7b97\u65b9\u5f0f\u5c31\u662f<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>threadId = blockIdx.x;\n<\/code><\/pre>\n\n\n\n<p>\u5176\u4e2dthreadId\u7684\u53d6\u503c\u8303\u56f4\u4e3a0\u5230N-1\u3002\u5bf9\u4e8e\u8fd9\u79cd\u60c5\u51b5\uff0c\u6211\u4eec\u53ef\u4ee5\u5c06\u5176\u770b\u4f5c\u662f\u4e00\u4e2a\u5217\u5411\u91cf\uff0c\u5217\u5411\u91cf\u4e2d\u7684\u6bcf\u4e00\u884c\u5bf9\u5e94\u4e00\u4e2a\u7ebf\u7a0b\u5757\u3002\u5217\u5411\u91cf\u4e2d\u6bcf\u4e00\u884c\u53ea\u67091\u4e2a\u5143\u7d20\uff0c\u5bf9\u5e94\u4e00\u4e2a\u7ebf\u7a0b\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"2\u4f7f\u7528mn\u4e2a\u7ebf\u7a0b\u5757\u6bcf\u4e2a\u7ebf\u7a0b\u57571\u4e2a\u7ebf\u7a0b\">2.\u4f7f\u7528M\u00d7N\u4e2a\u7ebf\u7a0b\u5757\uff0c\u6bcf\u4e2a\u7ebf\u7a0b\u57571\u4e2a\u7ebf\u7a0b<\/h3>\n\n\n\n<p>\u7531\u4e8e\u7ebf\u7a0b\u5757\u662f2\u7ef4\u7684\uff0c\u6545\u53ef\u4ee5\u770b\u505a\u662f\u4e00\u4e2aM*N\u76842\u7ef4\u77e9\u9635\uff0c\u5176\u7ebf\u7a0b\u53f7\u6709\u4e24\u4e2a\u7ef4\u5ea6\uff0c\u5373\uff1a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>dim3 dimGrid(M,N);\ndim3 dimBlock(1);\n\n<\/code><\/pre>\n\n\n\n<p>\u5176\u4e2d<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>blockIdx.x \u53d6\u503c0\u5230M-1\nblockIdx.y 
\u53d6\u503c0\u5230N-1\n\n<\/code><\/pre>\n\n\n\n<p>\u8fd9\u79cd\u60c5\u51b5\u4e00\u822c\u7528\u4e8e\u5904\u74062\u7ef4\u6570\u636e\u7ed3\u6784\uff0c\u6bd4\u59822\u7ef4\u56fe\u50cf\u3002\u6bcf\u4e00\u4e2a\u50cf\u7d20\u7528\u4e00\u4e2a\u7ebf\u7a0b\u6765\u5904\u7406\uff0c\u6b64\u65f6\u9700\u8981\u7ebf\u7a0b\u53f7\u6765\u6620\u5c04\u56fe\u50cf\u50cf\u7d20\u7684\u5bf9\u5e94\u4f4d\u7f6e\uff0c\u5982<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>pos = blockIdx.y * gridDim.x + blockIdx.x; \/\/\u5176\u4e2dgridDim.x\u7b49\u4e8eM\n<\/code><\/pre>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"3\u4f7f\u7528\u4e00\u4e2a\u7ebf\u7a0b\u5757\u8be5\u7ebf\u7a0b\u5177\u6709n\u4e2a\u7ebf\u7a0b\u5373\">3.\u4f7f\u7528\u4e00\u4e2a\u7ebf\u7a0b\u5757\uff0c\u8be5\u7ebf\u7a0b\u5757\u5177\u6709N\u4e2a\u7ebf\u7a0b\uff0c\u5373<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>dim3 dimGrid(1);\ndim3 dimBlock(N);\n<\/code><\/pre>\n\n\n\n<p>\u6b64\u65f6\u7ebf\u7a0b\u53f7\u7684\u8ba1\u7b97\u65b9\u5f0f\u4e3a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>threadId = threadIdx.x;\n<\/code><\/pre>\n\n\n\n<p>\u5176\u4e2dthreadId\u7684\u8303\u56f4\u662f0\u5230N-1\uff0c\u5bf9\u4e8e\u8fd9\u79cd\u60c5\u51b5\uff0c\u53ef\u4ee5\u770b\u505a\u662f\u4e00\u4e2a\u884c\u5411\u91cf\uff0c\u884c\u5411\u91cf\u4e2d\u7684\u6bcf\u4e00\u4e2a\u5143\u7d20\u5bf9\u5e94\u7740\u4e00\u4e2a\u7ebf\u7a0b\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"4\u4f7f\u7528m\u4e2a\u7ebf\u7a0b\u5757\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u5185\u542b\u6709n\u4e2a\u7ebf\u7a0b\u5373\">4.\u4f7f\u7528M\u4e2a\u7ebf\u7a0b\u5757\uff0c\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u5185\u542b\u6709N\u4e2a\u7ebf\u7a0b\uff0c\u5373<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>dim3 dimGrid(M);\ndim3 
dimBlock(N);\n<\/code><\/pre>\n\n\n\n<p>\u8fd9\u79cd\u60c5\u51b5\uff0c\u53ef\u4ee5\u628a\u5b83\u60f3\u8c61\u6210\u4e8c\u7ef4\u77e9\u9635\uff0c\u77e9\u9635\u7684\u884c\u4e0e\u7ebf\u7a0b\u5757\u5bf9\u5e94\uff0c\u77e9\u9635\u7684\u5217\u4e0e\u7ebf\u7a0b\u7f16\u53f7\u5bf9\u5e94\uff0c\u90a3\u7ebf\u7a0b\u53f7\u7684\u8ba1\u7b97\u65b9\u5f0f\u4e3a<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>threadId = threadIdx.x + blockIdx.x*blockDim.x;\n<\/code><\/pre>\n\n\n\n<p>\u4e0a\u9762\u5176\u5b9e\u5c31\u662f\u628a\u4e8c\u7ef4\u7684\u7d22\u5f15\u7a7a\u95f4\u8f6c\u6362\u4e3a\u4e00\u7ef4\u7d22\u5f15\u7a7a\u95f4\u7684\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"5\u4f7f\u7528mn\u7684\u4e8c\u7ef4\u7ebf\u7a0b\u5757\u6bcf\u4e00\u4e2a\u7ebf\u7a0b\u5757\u5177\u6709pq\u4e2a\u7ebf\u7a0b\u5373\">5.\u4f7f\u7528M\u00d7N\u7684\u4e8c\u7ef4\u7ebf\u7a0b\u5757\uff0c\u6bcf\u4e00\u4e2a\u7ebf\u7a0b\u5757\u5177\u6709P\u00d7Q\u4e2a\u7ebf\u7a0b\uff0c\u5373<\/h3>\n\n\n\n<pre class=\"wp-block-code\"><code>dim3 dimGrid(M, N);\ndim3 dimBlock(P, Q);\n<\/code><\/pre>\n\n\n\n<p>\u8fd9\u79cd\u60c5\u51b5\u5176\u5b9e\u662f\u6211\u4eec\u9047\u5230\u7684\u6700\u591a\u60c5\u51b5\uff0c\u7279\u522b\u9002\u7528\u4e8e\u5904\u7406\u5177\u6709\u4e8c\u7ef4\u6570\u636e\u7ed3\u6784\u7684\u7b97\u6cd5\uff0c\u6bd4\u5982\u56fe\u50cf\u5904\u7406\u9886\u57df\u3002<\/p>\n\n\n\n<p>\u5176\u7d22\u5f15\u6709\u4e24\u4e2a\u7ef4\u5ea6<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>threadId.x = blockIdx.x*blockDim.x+threadIdx.x;\nthreadId.y = blockIdx.y*blockDim.y+threadIdx.y;\n<\/code><\/pre>\n\n\n\n<p>\u4e0a\u8ff0\u516c\u5f0f\u5c31\u662f\u628a\u7ebf\u7a0b\u548c\u7ebf\u7a0b\u5757\u7684\u7d22\u5f15\u6620\u5c04\u4e3a\u56fe\u50cf\u50cf\u7d20\u5750\u6807\u7684\u8ba1\u7b97\u65b9\u6cd5\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" 
id=\"cuda\u5e94\u7528\u4f8b\u5b50\">CUDA\u5e94\u7528\u4f8b\u5b50<\/h2>\n\n\n\n<p>\u6211\u4eec\u5df2\u7ecf\u638c\u63e1\u4e86CUDA\u7f16\u7a0b\u7684\u57fa\u672c\u8bed\u6cd5\uff0c\u73b0\u5728\u6211\u4eec\u5f00\u59cb\u4ee5\u4e00\u4e9b\u5c0f\u4f8b\u5b50\u6765\u771f\u6b63\u4e0a\u624bCUDA\u3002<\/p>\n\n\n\n<p>\u9996\u5148\u6211\u4eec\u7f16\u5199\u4e00\u4e2a\u7a0b\u5e8f\uff0c\u67e5\u770b\u6211\u4eecGPU\u7684\u4e00\u4e9b\u786c\u4ef6\u914d\u7f6e\u60c5\u51b5\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>#include \"device_launch_parameters.h\"\n#include &lt;iostream&gt;\n\nint main()\n{\n    int deviceCount;\n    cudaGetDeviceCount(&amp;deviceCount);\n    for(int i=0;i&lt;deviceCount;i++)\n    {\n        cudaDeviceProp devProp;\n        cudaGetDeviceProperties(&amp;devProp, i);\n        std::cout &lt;&lt; \"\u4f7f\u7528GPU device \" &lt;&lt; i &lt;&lt; \": \" &lt;&lt; devProp.name &lt;&lt; std::endl;\n        std::cout &lt;&lt; \"\u8bbe\u5907\u5168\u5c40\u5185\u5b58\u603b\u91cf\uff1a \" &lt;&lt; devProp.totalGlobalMem \/ 1024 \/ 1024 &lt;&lt; \"MB\" &lt;&lt; std::endl;\n        std::cout &lt;&lt; \"SM\u7684\u6570\u91cf\uff1a\" &lt;&lt; devProp.multiProcessorCount &lt;&lt; std::endl;\n        std::cout &lt;&lt; \"\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u7684\u5171\u4eab\u5185\u5b58\u5927\u5c0f\uff1a\" &lt;&lt; devProp.sharedMemPerBlock \/ 1024.0 &lt;&lt; \" KB\" &lt;&lt; std::endl;\n        std::cout &lt;&lt; \"\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u7684\u6700\u5927\u7ebf\u7a0b\u6570\uff1a\" &lt;&lt; devProp.maxThreadsPerBlock &lt;&lt; std::endl;\n        std::cout &lt;&lt; \"\u8bbe\u5907\u4e0a\u4e00\u4e2a\u7ebf\u7a0b\u5757\uff08Block\uff09\u79cd\u53ef\u7528\u768432\u4f4d\u5bc4\u5b58\u5668\u6570\u91cf\uff1a \" &lt;&lt; devProp.regsPerBlock &lt;&lt; std::endl;\n        std::cout &lt;&lt; \"\u6bcf\u4e2aEM\u7684\u6700\u5927\u7ebf\u7a0b\u6570\uff1a\" &lt;&lt; devProp.maxThreadsPerMultiProcessor &lt;&lt; std::endl;\n        std::cout &lt;&lt; 
\"\u6bcf\u4e2aEM\u7684\u6700\u5927\u7ebf\u7a0b\u675f\u6570\uff1a\" &lt;&lt; devProp.maxThreadsPerMultiProcessor \/ 32 &lt;&lt; std::endl;\n        std::cout &lt;&lt; \"\u8bbe\u5907\u4e0a\u591a\u5904\u7406\u5668\u7684\u6570\u91cf\uff1a \" &lt;&lt; devProp.multiProcessorCount &lt;&lt; std::endl;\n        std::cout &lt;&lt; \"======================================================\" &lt;&lt; std::endl;     \n        \n    }\n    return 0;\n}\n\n\n<\/code><\/pre>\n\n\n\n<p>\u6211\u4eec\u5229\u7528nvcc\u6765\u7f16\u8bd1\u7a0b\u5e8f\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>nvcc test1.cu -o test1\n<\/code><\/pre>\n\n\n\n<p>\u8f93\u51fa\u7ed3\u679c\uff1a\u56e0\u4e3a\u6211\u7684\u670d\u52a1\u5668\u662f8\u4e2aTITAN GPU\uff0c\u4e3a\u4e86\u7701\u7565\u91cd\u590d\u4fe1\u606f\uff0c\u4e0b\u9762\u53ea\u663e\u793a\u4e24\u4e2aGPU\u7ed3\u679c<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>\u4f7f\u7528GPU device 0: TITAN X (Pascal)\n\u8bbe\u5907\u5168\u5c40\u5185\u5b58\u603b\u91cf\uff1a 12189MB\nSM\u7684\u6570\u91cf\uff1a28\n\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u7684\u5171\u4eab\u5185\u5b58\u5927\u5c0f\uff1a48 KB\n\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u7684\u6700\u5927\u7ebf\u7a0b\u6570\uff1a1024\n\u8bbe\u5907\u4e0a\u4e00\u4e2a\u7ebf\u7a0b\u5757\uff08Block\uff09\u79cd\u53ef\u7528\u768432\u4f4d\u5bc4\u5b58\u5668\u6570\u91cf\uff1a 65536\n\u6bcf\u4e2aEM\u7684\u6700\u5927\u7ebf\u7a0b\u6570\uff1a2048\n\u6bcf\u4e2aEM\u7684\u6700\u5927\u7ebf\u7a0b\u675f\u6570\uff1a64\n\u8bbe\u5907\u4e0a\u591a\u5904\u7406\u5668\u7684\u6570\u91cf\uff1a 28\n======================================================\n\u4f7f\u7528GPU device 1: TITAN X (Pascal)\n\u8bbe\u5907\u5168\u5c40\u5185\u5b58\u603b\u91cf\uff1a 12189MB\nSM\u7684\u6570\u91cf\uff1a28\n\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u7684\u5171\u4eab\u5185\u5b58\u5927\u5c0f\uff1a48 
KB\n\u6bcf\u4e2a\u7ebf\u7a0b\u5757\u7684\u6700\u5927\u7ebf\u7a0b\u6570\uff1a1024\n\u8bbe\u5907\u4e0a\u4e00\u4e2a\u7ebf\u7a0b\u5757\uff08Block\uff09\u79cd\u53ef\u7528\u768432\u4f4d\u5bc4\u5b58\u5668\u6570\u91cf\uff1a 65536\n\u6bcf\u4e2aEM\u7684\u6700\u5927\u7ebf\u7a0b\u6570\uff1a2048\n\u6bcf\u4e2aEM\u7684\u6700\u5927\u7ebf\u7a0b\u675f\u6570\uff1a64\n\u8bbe\u5907\u4e0a\u591a\u5904\u7406\u5668\u7684\u6570\u91cf\uff1a 28\n======================================================\n\n.......\n<\/code><\/pre>\n\n\n\n<p>\u7b2c\u4e00\u4e2a\u8ba1\u7b97\u4efb\u52a1\uff1a\u5c06\u4e24\u4e2a\u5143\u7d20\u6570\u76ee\u4e3a1024\u00d71024\u7684float\u6570\u7ec4\u76f8\u52a0\u3002<\/p>\n\n\n\n<p>\u9996\u5148\u6211\u4eec\u601d\u8003\u4e00\u4e0b\u5982\u679c\u53ea\u7528CPU\u6211\u4eec\u600e\u4e48\u4e32\u884c\u5b8c\u6210\u8fd9\u4e2a\u4efb\u52a1\u3002<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>#include &lt;iostream&gt;\n#include &lt;stdlib.h&gt;\n#include &lt;sys\/time.h&gt;\n#include &lt;math.h&gt;\n\nusing namespace std;\n\nint main()\n{\n    struct timeval start, end;\n    gettimeofday( &amp;start, NULL );\n    float*A, *B, *C;\n    int n = 1024 * 1024;\n    int size = n * sizeof(float);\n    A = (float*)malloc(size);\n    B = (float*)malloc(size);\n    C = (float*)malloc(size);\n\n    for(int i=0;i&lt;n;i++)\n    {\n        A&#91;i] = 90.0;\n        B&#91;i] = 10.0;\n    }\n    \n    for(int i=0;i&lt;n;i++)\n    {\n        C&#91;i] = A&#91;i] + B&#91;i];\n    }\n\n    float max_error = 0.0;\n    for(int i=0;i&lt;n;i++)\n    {\n        max_error += fabs(100.0-C&#91;i]);\n    }\n    cout &lt;&lt; \"max_error is \" &lt;&lt; max_error &lt;&lt; endl;\n    gettimeofday( &amp;end, NULL );\n    int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;\n    cout &lt;&lt; \"total time is \" &lt;&lt; timeuse\/1000 &lt;&lt; \"ms\" &lt;&lt;endl;\n    return 0;\n}\n<\/code><\/pre>\n\n\n\n<p>CPU\u65b9\u5f0f\u8f93\u51fa\u7ed3\u679c<\/p>\n\n\n\n<pre 
class=\"wp-block-code\"><code>max_error is 0\ntotal time is 22ms\n<\/code><\/pre>\n\n\n\n<p>\u5982\u679c\u6211\u4eec\u4f7f\u7528GPU\u6765\u505a\u5e76\u884c\u8ba1\u7b97\uff0c\u901f\u5ea6\u5c06\u4f1a\u5982\u4f55\u5462\uff1f<\/p>\n\n\n\n<p>\u7f16\u7a0b\u8981\u70b9\uff1a<\/p>\n\n\n\n<ol class=\"wp-block-list\"><li>\u6bcf\u4e2aBlock\u4e2d\u7684Thread\u6570\u4e0d\u80fd\u8d85\u8fc7\u8bbe\u5907\u4e0a\u9650\uff08\u4e0a\u6587\u67e5\u8be2\u7ed3\u679c\u4e3a1024\uff0c\u672c\u4f8b\u53d6512\uff09\uff1b<\/li><li>\u4e3a\u4e86\u5145\u5206\u5229\u7528SM\uff0cBlock\u6570\u5c3d\u53ef\u80fd\u591a\uff0c&gt;100\u3002<\/li><\/ol>\n\n\n\n<pre class=\"wp-block-code\"><code>#include \"cuda_runtime.h\"\n#include &lt;stdlib.h&gt;\n#include &lt;iostream&gt;\n#include &lt;sys\/time.h&gt;\n\nusing namespace std;\n\n__global__ void Plus(float A&#91;], float B&#91;], float C&#91;], int n)\n{\n    int i = blockDim.x * blockIdx.x + threadIdx.x;\n    C&#91;i] = A&#91;i] + B&#91;i];\n}\n\nint main()\n{\n    struct timeval start, end;\n    gettimeofday( &amp;start, NULL );\n    float*A, *Ad, *B, *Bd, *C, *Cd;\n    int n = 1024 * 1024;\n    int size = n * sizeof(float);\n\n    \/\/ CPU\u7aef\u5206\u914d\u5185\u5b58\n    A = (float*)malloc(size);\n    B = (float*)malloc(size);\n    C = (float*)malloc(size);\n\n    \/\/ \u521d\u59cb\u5316\u6570\u7ec4\n    for(int i=0;i&lt;n;i++)\n    {\n        A&#91;i] = 90.0;\n        B&#91;i] = 10.0;\n    }\n\n    \/\/ GPU\u7aef\u5206\u914d\u5185\u5b58\n    cudaMalloc((void**)&amp;Ad, size);\n    cudaMalloc((void**)&amp;Bd, size);\n    cudaMalloc((void**)&amp;Cd, size);\n\n    \/\/ CPU\u7684\u6570\u636e\u62f7\u8d1d\u5230GPU\u7aef\n    cudaMemcpy(Ad, A, size, cudaMemcpyHostToDevice);\n    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);\n    cudaMemcpy(Bd, B, size, cudaMemcpyHostToDevice);\n\n    \/\/ \u5b9a\u4e49kernel\u6267\u884c\u914d\u7f6e\uff0c\uff081024*1024\/512\uff09\u4e2ablock\uff0c\u6bcf\u4e2ablock\u91cc\u9762\u6709512\u4e2a\u7ebf\u7a0b\n    dim3 dimBlock(512);\n    dim3 dimGrid(n\/512);\n\n    \/\/ \u6267\u884ckernel\n    Plus&lt;&lt;&lt;dimGrid, 
dimBlock&gt;&gt;&gt;(Ad, Bd, Cd, n);\n\n    \/\/ \u5c06\u5728GPU\u7aef\u8ba1\u7b97\u597d\u7684\u7ed3\u679c\u62f7\u8d1d\u56deCPU\u7aef\n    cudaMemcpy(C, Cd, size, cudaMemcpyDeviceToHost);\n\n    \/\/ \u6821\u9a8c\u8bef\u5dee\n    float max_error = 0.0;\n    for(int i=0;i&lt;n;i++)\n    {\n        max_error += fabs(100.0 - C&#91;i]);\n    }\n\n    cout &lt;&lt; \"max error is \" &lt;&lt; max_error &lt;&lt; endl;\n\n    \/\/ \u91ca\u653eCPU\u7aef\u3001GPU\u7aef\u7684\u5185\u5b58\n    free(A);\n    free(B);\n    free(C);\n    cudaFree(Ad);\n    cudaFree(Bd);\n    cudaFree(Cd);\n    gettimeofday( &amp;end, NULL );\n    int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;\n    cout &lt;&lt; \"total time is \" &lt;&lt; timeuse\/1000 &lt;&lt; \"ms\" &lt;&lt;endl;\n    return 0;\n}\n<\/code><\/pre>\n\n\n\n<p>GPU\u65b9\u5f0f\u8f93\u51fa\u7ed3\u679c<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>max error is 0\ntotal time is 1278ms\n<\/code><\/pre>\n\n\n\n<p>\u7531\u4e0a\u9762\u7684\u4f8b\u5b50\u770b\u51fa\uff0c\u4f7f\u7528CUDA\u7f16\u7a0b\u65f6\u6211\u4eec\u770b\u4e0d\u5230for\u5faa\u73af\u4e86\uff0c\u56e0\u4e3aCPU\u7f16\u7a0b\u7684\u5faa\u73af\u5df2\u7ecf\u88ab\u5206\u6563\u5230\u5404\u4e2athread\u4e0a\u505a\u4e86\uff0c\u6240\u4ee5\u6211\u4eec\u4e5f\u5c31\u770b\u4e0d\u5230for\u4e00\u7c7b\u7684\u8bed\u53e5\u3002\u4ece\u7ed3\u679c\u4e0a\u770b\uff0cCPU\u7684\u5faa\u73af\u8ba1\u7b97\u7684\u901f\u5ea6\u6bd4GPU\u8ba1\u7b97\u5feb\u591a\u4e86\uff0c\u539f\u56e0\u5c31\u5728\u4e8eCUDA\u4e2d\u6709\u5927\u91cf\u7684\u5185\u5b58\u62f7\u8d1d\u64cd\u4f5c\uff08\u6570\u636e\u4f20\u8f93\u82b1\u8d39\u4e86\u5927\u91cf\u65f6\u95f4\uff0c\u800c\u8ba1\u7b97\u65f6\u95f4\u5374\u975e\u5e38\u5c11\uff09\uff0c\u5982\u679c\u8ba1\u7b97\u91cf\u6bd4\u8f83\u5c0f\u7684\u8bdd\uff0cCPU\u8ba1\u7b97\u4f1a\u66f4\u5408\u9002\u4e00\u4e9b\u3002<\/p>\n\n\n\n<p>\u4e0b\u9762\u8ba1\u7b97\u4e00\u4e2a\u7a0d\u5fae\u590d\u6742\u7684\u4f8b\u5b50\uff0c\u77e9\u9635\u52a0\u6cd5\uff0c\u5373\u5
bf9\u4e24\u4e2a\u77e9\u9635\u5bf9\u5e94\u5750\u6807\u7684\u5143\u7d20\u76f8\u52a0\u540e\u7684\u7ed3\u679c\u5b58\u50a8\u5728\u7b2c\u4e09\u4e2a\u7684\u5bf9\u5e94\u4f4d\u7f6e\u7684\u5143\u7d20\u4e0a\u3002<\/p>\n\n\n\n<p>\u503c\u5f97\u6ce8\u610f\u7684\u662f\uff0c\u8fd9\u4e2a\u8ba1\u7b97\u4efb\u52a1\u6211\u91c7\u7528\u4e86\u4e8c\u7ef4\u6570\u7ec4\u7684\u8ba1\u7b97\u65b9\u5f0f\uff0c\u6ce8\u610f\u4e00\u4e0b\u4e8c\u7ef4\u6570\u7ec4\u5728CUDA\u7f16\u7a0b\u4e2d\u7684\u5199\u6cd5\u3002<\/p>\n\n\n\n<p>CPU\u7248\u672c<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>#include &lt;stdlib.h&gt;\n#include &lt;iostream&gt;\n#include &lt;sys\/time.h&gt;\n#include &lt;math.h&gt;\n\n#define ROWS 1024\n#define COLS 1024\n\nusing namespace std;\n\nint main()\n{\n    struct timeval start, end;\n    gettimeofday( &amp;start, NULL );\n    int *A, **A_ptr, *B, **B_ptr, *C, **C_ptr;\n    int total_size = ROWS*COLS*sizeof(int);\n    A = (int*)malloc(total_size);\n    B = (int*)malloc(total_size);\n    C = (int*)malloc(total_size);\n    A_ptr = (int**)malloc(ROWS*sizeof(int*));\n    B_ptr = (int**)malloc(ROWS*sizeof(int*));\n    C_ptr = (int**)malloc(ROWS*sizeof(int*));\n    \n    \/\/CPU\u4e00\u7ef4\u6570\u7ec4\u521d\u59cb\u5316\n    for(int i=0;i&lt;ROWS*COLS;i++)\n    {\n        A&#91;i] = 80;\n        B&#91;i] = 20;\n    }\n    \n    for(int i=0;i&lt;ROWS;i++)\n    {\n        A_ptr&#91;i] = A + COLS*i;\n        B_ptr&#91;i] = B + COLS*i;\n        C_ptr&#91;i] = C + COLS*i;\n    }\n    \n    for(int i=0;i&lt;ROWS;i++)\n        for(int j=0;j&lt;COLS;j++)\n        {\n            C_ptr&#91;i]&#91;j] = A_ptr&#91;i]&#91;j] + B_ptr&#91;i]&#91;j];\n        }\n        \n    \/\/\u68c0\u67e5\u7ed3\u679c\n    int max_error = 0;\n    for(int i=0;i&lt;ROWS*COLS;i++)\n    {\n        \/\/cout &lt;&lt; C&#91;i] &lt;&lt; endl;\n        max_error += abs(100-C&#91;i]);\n    }\n    \n    cout &lt;&lt; \"max_error is \" &lt;&lt; max_error &lt;&lt;endl;     \n    gettimeofday( &amp;end, NULL );\n    int timeuse = 
1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;\n    cout &lt;&lt; \"total time is \" &lt;&lt; timeuse\/1000 &lt;&lt; \"ms\" &lt;&lt;endl;\n    \n    return 0;\n}\n<\/code><\/pre>\n\n\n\n<p>CPU\u65b9\u5f0f\u8f93\u51fa<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>max_error is 0\ntotal time is 29ms\n<\/code><\/pre>\n\n\n\n<p>GPU\u7248\u672c<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>#include \"cuda_runtime.h\"\n#include \"device_launch_parameters.h\"\n#include &lt;sys\/time.h&gt; \n#include &lt;stdio.h&gt;\n#include &lt;math.h&gt;\n#define Row  1024\n#define Col 1024\n \n \n__global__ void addKernel(int **C,  int **A, int ** B)\n{\n    int idx = threadIdx.x + blockDim.x * blockIdx.x;\n    int idy = threadIdx.y + blockDim.y * blockIdx.y;\n    if (idx &lt; Col &amp;&amp; idy &lt; Row) {\n        C&#91;idy]&#91;idx] = A&#91;idy]&#91;idx] + B&#91;idy]&#91;idx];\n    }\n}\n \nint main()\n{\n    struct timeval start, end;\n    gettimeofday( &amp;start, NULL );\n\n    int **A = (int **)malloc(sizeof(int*) * Row);\n    int **B = (int **)malloc(sizeof(int*) * Row);\n    int **C = (int **)malloc(sizeof(int*) * Row);\n    int *dataA = (int *)malloc(sizeof(int) * Row * Col);\n    int *dataB = (int *)malloc(sizeof(int) * Row * Col);\n    int *dataC = (int *)malloc(sizeof(int) * Row * Col);\n    int **d_A;\n    int **d_B;\n    int **d_C;\n    int *d_dataA;\n    int *d_dataB;\n    int *d_dataC;\n    \/\/malloc device memory\n    cudaMalloc((void**)&amp;d_A, sizeof(int **) * Row);\n    cudaMalloc((void**)&amp;d_B, sizeof(int **) * Row);\n    cudaMalloc((void**)&amp;d_C, sizeof(int **) * Row);\n    cudaMalloc((void**)&amp;d_dataA, sizeof(int) *Row*Col);\n    cudaMalloc((void**)&amp;d_dataB, sizeof(int) *Row*Col);\n    cudaMalloc((void**)&amp;d_dataC, sizeof(int) *Row*Col);\n    \/\/set value\n    for (int i = 0; i &lt; Row*Col; i++) {\n        dataA&#91;i] = 90;\n        dataB&#91;i] = 10;\n    }\n    
\/\/\u5c06\u4e3b\u673a\u6307\u9488A\u6307\u5411\u8bbe\u5907\u6570\u636e\u4f4d\u7f6e\uff0c\u76ee\u7684\u662f\u8ba9\u8bbe\u5907\u4e8c\u7ea7\u6307\u9488\u80fd\u591f\u6307\u5411\u8bbe\u5907\u6570\u636e\u4e00\u7ea7\u6307\u9488\n    \/\/A \u548c  dataA \u90fd\u4f20\u5230\u4e86\u8bbe\u5907\u4e0a\uff0c\u4f46\u662f\u4e8c\u8005\u8fd8\u6ca1\u6709\u5efa\u7acb\u5bf9\u5e94\u5173\u7cfb\n    for (int i = 0; i &lt; Row; i++) {\n        A&#91;i] = d_dataA + Col * i;\n        B&#91;i] = d_dataB + Col * i;\n        C&#91;i] = d_dataC + Col * i;\n    }\n                                                                \n    cudaMemcpy(d_A, A, sizeof(int*) * Row, cudaMemcpyHostToDevice);\n    cudaMemcpy(d_B, B, sizeof(int*) * Row, cudaMemcpyHostToDevice);\n    cudaMemcpy(d_C, C, sizeof(int*) * Row, cudaMemcpyHostToDevice);\n    cudaMemcpy(d_dataA, dataA, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);\n    cudaMemcpy(d_dataB, dataB, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);\n    dim3 threadPerBlock(16, 16);\n    dim3 blockNumber( (Col + threadPerBlock.x - 1)\/ threadPerBlock.x, (Row + threadPerBlock.y - 1) \/ threadPerBlock.y );\n    printf(\"Block(%d,%d)   Grid(%d,%d).\\n\", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);\n    addKernel &lt;&lt; &lt;blockNumber, threadPerBlock &gt;&gt; &gt; (d_C, d_A, d_B);\n    \/\/\u62f7\u8d1d\u8ba1\u7b97\u6570\u636e-\u4e00\u7ea7\u6570\u636e\u6307\u9488\n    cudaMemcpy(dataC, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);\n                                                                                             \n    int max_error = 0;\n    for(int i=0;i&lt;Row*Col;i++)\n    {\n        \/\/printf(\"%d\\n\", dataC&#91;i]);\n        max_error += abs(100-dataC&#91;i]);\n    }\n\n    \/\/\u91ca\u653e\u5185\u5b58\n    free(A);\n    free(B);\n    free(C);\n    free(dataA);\n    free(dataB);\n    free(dataC);\n    cudaFree(d_A);\n    cudaFree(d_B);\n    cudaFree(d_C);\n    cudaFree(d_dataA);\n    
cudaFree(d_dataB);\n    cudaFree(d_dataC);\n\n    printf(\"max_error is %d\\n\", max_error);\n    gettimeofday( &amp;end, NULL );\n    int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;\n    printf(\"total time is %d ms\\n\", timeuse\/1000);\n\n    return 0;\n}\n<\/code><\/pre>\n\n\n\n<p>GPU\u8f93\u51fa<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>Block(16,16)   Grid(64,64).\nmax_error is 0\ntotal time is 442 ms\n<\/code><\/pre>\n\n\n\n<p>\u4ece\u7ed3\u679c\u770b\u51fa\uff0cCPU\u8ba1\u7b97\u65f6\u95f4\u8fd8\u662f\u6bd4GPU\u7684\u8ba1\u7b97\u65f6\u95f4\u77ed\u3002\u8fd9\u91cc\u9700\u8981\u6307\u51fa\u7684\u662f\uff0c\u8fd9\u79cd\u4e8c\u7ef4\u6570\u7ec4\u7684\u7a0b\u5e8f\u5199\u6cd5\u7684\u6548\u7387\u5e76\u4e0d\u9ad8\uff08\u867d\u7136\u6bd4\u8f83\u7b26\u5408\u6211\u4eec\u7684\u601d\u7ef4\u65b9\u5f0f\uff09\uff0c\u56e0\u4e3a\u6211\u4eec\u505a\u4e86\u4e24\u6b21\u8bbf\u5b58\u64cd\u4f5c\u3002\u6240\u4ee5\u4e00\u822c\u800c\u8a00\uff0c\u505a\u9ad8\u6027\u80fd\u8ba1\u7b97\u4e00\u822c\u4e0d\u4f1a\u91c7\u53d6\u8fd9\u79cd\u7f16\u7a0b\u65b9\u5f0f\u3002<\/p>\n\n\n\n<p>\u6700\u540e\u4e00\u4e2a\u4f8b\u5b50\u6211\u4eec\u5c06\u8ba1\u7b97\u4e00\u4e2a\u66f4\u52a0\u590d\u6742\u7684\u4efb\u52a1\uff0c\u77e9\u9635\u4e58\u6cd5<\/p>\n\n\n\n<p>\u56de\u987e\u4e00\u4e0b\u77e9\u9635\u4e58\u6cd5\uff1a\u4e24\u77e9\u9635\u76f8\u4e58\uff0c\u5de6\u77e9\u9635\u7b2c\u4e00\u884c\u4e58\u4ee5\u53f3\u77e9\u9635\u7b2c\u4e00\u5217\uff08\u5206\u522b\u76f8\u4e58\uff0c\u7b2c\u4e00\u4e2a\u6570\u4e58\u7b2c\u4e00\u4e2a\u6570\uff09\uff0c\u4e58\u5b8c\u4e4b\u540e\u76f8\u52a0\uff0c\u5373\u4e3a\u7ed3\u679c\u7684\u7b2c\u4e00\u884c\u7b2c\u4e00\u5217\u7684\u6570\uff0c\u4f9d\u6b21\u5f80\u4e0b\u7b97\uff0c\u76f4\u5230\u8ba1\u7b97\u5b8c\u6240\u6709\u77e9\u9635\u5143\u7d20\u3002<\/p>\n\n\n\n<p>CPU\u7248\u672c<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>#include &lt;iostream&gt;\n#include &lt;stdlib.h&gt;\n#include &lt;sys\/time.h&gt;\n\n#define ROWS 1024\n#define COLS 1024\n\nusing 
namespace std;\n\nvoid matrix_mul_cpu(float* M, float* N, float* P, int width)\n{\n    for(int i=0;i&lt;width;i++)\n        for(int j=0;j&lt;width;j++)\n        {\n            float sum = 0.0;\n            for(int k=0;k&lt;width;k++)\n            {\n                float a = M&#91;i*width+k];\n                float b = N&#91;k*width+j];\n                sum += a*b;\n            }\n            P&#91;i*width+j] = sum;\n        }\n}\n\nint main()\n{\n    struct timeval start, end;\n    gettimeofday( &amp;start, NULL );\n    float *A, *B, *C;\n    int total_size = ROWS*COLS*sizeof(float);\n    A = (float*)malloc(total_size);\n    B = (float*)malloc(total_size);\n    C = (float*)malloc(total_size);\n\n    \/\/CPU\u4e00\u7ef4\u6570\u7ec4\u521d\u59cb\u5316\n    for(int i=0;i&lt;ROWS*COLS;i++)\n    {\n        A&#91;i] = 80.0;\n        B&#91;i] = 20.0;\n    }\n\n    matrix_mul_cpu(A, B, C, COLS);\n\n    gettimeofday( &amp;end, NULL );\n    int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;\n    cout &lt;&lt; \"total time is \" &lt;&lt; timeuse\/1000 &lt;&lt; \"ms\" &lt;&lt;endl;\n\n    return 0;\n}\n<\/code><\/pre>\n\n\n\n<p>CPU\u8f93\u51fa<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>total time is 7617ms\n<\/code><\/pre>\n\n\n\n<p>\u68b3\u7406\u4e00\u4e0bCUDA\u6c42\u89e3\u77e9\u9635\u4e58\u6cd5\u7684\u601d\u8def\uff1a\u56e0\u4e3aC=A\u00d7B\uff0c\u6211\u4eec\u5229\u7528\u6bcf\u4e2a\u7ebf\u7a0b\u6c42\u89e3C\u77e9\u9635\u6bcf\u4e2a(x, y)\u7684\u5143\u7d20\uff0c\u6bcf\u4e2a\u7ebf\u7a0b\u8f7d\u5165A\u7684\u4e00\u884c\u548cB\u7684\u4e00\u5217\uff0c\u904d\u5386\u5404\u81ea\u884c\u5217\u5143\u7d20\uff0c\u5bf9A\u3001B\u5bf9\u5e94\u7684\u5143\u7d20\u505a\u4e00\u6b21\u4e58\u6cd5\u548c\u4e00\u6b21\u52a0\u6cd5\u3002<\/p>\n\n\n\n<p>GPU\u7248\u672c<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>#include \"cuda_runtime.h\"\n#include \"device_launch_parameters.h\"\n#include &lt;sys\/time.h&gt; \n#include &lt;stdio.h&gt;\n#include &lt;math.h&gt;\n#define 
Row  1024\n#define Col 1024\n\n \n__global__ void matrix_mul_gpu(int *M, int* N, int* P, int width)\n{\n    int i = threadIdx.x + blockDim.x * blockIdx.x;\n    int j = threadIdx.y + blockDim.y * blockIdx.y;\n                \n    int sum = 0;\n    for(int k=0;k&lt;width;k++)\n    {\n        int a = M&#91;j*width+k];\n        int b = N&#91;k*width+i];\n        sum += a*b;\n    }\n    P&#91;j*width+i] = sum;\n}\n \nint main()\n{\n    struct timeval start, end;\n    gettimeofday( &amp;start, NULL );\n\n    int *A = (int *)malloc(sizeof(int) * Row * Col);\n    int *B = (int *)malloc(sizeof(int) * Row * Col);\n    int *C = (int *)malloc(sizeof(int) * Row * Col);\n    \/\/malloc device memory\n    int *d_dataA, *d_dataB, *d_dataC;\n    cudaMalloc((void**)&amp;d_dataA, sizeof(int) *Row*Col);\n    cudaMalloc((void**)&amp;d_dataB, sizeof(int) *Row*Col);\n    cudaMalloc((void**)&amp;d_dataC, sizeof(int) *Row*Col);\n    \/\/set value\n    for (int i = 0; i &lt; Row*Col; i++) {\n        A&#91;i] = 90;\n        B&#91;i] = 10;\n    }\n                                                                \n    cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);\n    cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);\n    dim3 threadPerBlock(16, 16);\n    dim3 blockNumber((Col+threadPerBlock.x-1)\/ threadPerBlock.x, (Row+threadPerBlock.y-1)\/ threadPerBlock.y );\n    printf(\"Block(%d,%d)   Grid(%d,%d).\\n\", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);\n    matrix_mul_gpu &lt;&lt; &lt;blockNumber, threadPerBlock &gt;&gt; &gt; (d_dataA, d_dataB, d_dataC, Col);\n    \/\/\u62f7\u8d1d\u8ba1\u7b97\u6570\u636e-\u4e00\u7ea7\u6570\u636e\u6307\u9488\n    cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);\n                                                                                             \n    \/\/\u91ca\u653e\u5185\u5b58\n    free(A);\n    free(B);\n    free(C);\n    cudaFree(d_dataA);\n    
cudaFree(d_dataB);\n    cudaFree(d_dataC);\n\n    gettimeofday( &amp;end, NULL );\n    int timeuse = 1000000 * ( end.tv_sec - start.tv_sec ) + end.tv_usec - start.tv_usec;\n    printf(\"total time is %d ms\\n\", timeuse\/1000);\n\n    return 0;\n}\n<\/code><\/pre>\n\n\n\n<p>GPU\u8f93\u51fa<\/p>\n\n\n\n<pre class=\"wp-block-code\"><code>Block(16,16)   Grid(64,64).\ntotal time is 506 ms\n<\/code><\/pre>\n\n\n\n<p>\u4ece\u8fd9\u4e2a\u77e9\u9635\u4e58\u6cd5\u4efb\u52a1\u53ef\u4ee5\u770b\u51fa\uff0c\u6211\u4eec\u901a\u8fc7GPU\u8fdb\u884c\u5e76\u884c\u8ba1\u7b97\u7684\u65b9\u5f0f\u4ec5\u82b1\u8d39\u4e860.5\u79d2\uff0c\u4f46\u662fCPU\u4e32\u884c\u8ba1\u7b97\u65b9\u5f0f\u5374\u82b1\u8d39\u4e867.6\u79d2\uff0c\u8ba1\u7b97\u901f\u5ea6\u63d0\u5347\u4e86\u5341\u591a\u500d\uff0c\u53ef\u89c1\u5e76\u884c\u8ba1\u7b97\u7684\u5a01\u529b\uff01<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u82f1\u4f1f\u8fbe\u663e\u5361\u7b49\u7ea7\u662f\u6839\u636eCUDA\u8fdb\u884c&#46;&#46;&#46;<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[4],"tags":[],"class_list":["post-2259","post","type-post","status-publish","format-standard","hentry","category-4"],"_links":{"self":[{"href":"https:\/\/sanlangcode.com\/index.php\/wp-json\/wp\/v2\/posts\/2259","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/sanlangcode.com\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/sanlangcode.com\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/sanlangcode.com\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/sanlangcode.com\/index.php\/wp-json\/wp\/v2\/comments?post=2259"}],"version-history":[{"count":0,"href":"https:\/\/sanlangcode.com\/index.php\/wp-json\/wp\/v2\/posts\/2259\/revisions"}],"wp:attachment":[{"href":"https:\/\/sanlangcode.com\/index.php\/wp-json\
/wp\/v2\/media?parent=2259"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/sanlangcode.com\/index.php\/wp-json\/wp\/v2\/categories?post=2259"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/sanlangcode.com\/index.php\/wp-json\/wp\/v2\/tags?post=2259"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}