"use strict";(self.webpackChunkelementary_public_docs=self.webpackChunkelementary_public_docs||[]).push([[29019],{15680:(e,l,t)=>{t.d(l,{xA:()=>g,yg:()=>y});var n=t(96540);function a(e,l,t){return l in e?Object.defineProperty(e,l,{value:t,enumerable:!0,configurable:!0,writable:!0}):e[l]=t,e}function r(e,l){var t=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);l&&(n=n.filter((function(l){return Object.getOwnPropertyDescriptor(e,l).enumerable}))),t.push.apply(t,n)}return t}function p(e){for(var l=1;l=0||(a[t]=e[t]);return a}(e,l);if(Object.getOwnPropertySymbols){var r=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,t)&&(a[t]=e[t])}return a}var o=n.createContext({}),i=function(e){var l=n.useContext(o),t=l;return e&&(t="function"==typeof e?e(l):p(p({},l),e)),t},g=function(e){var l=i(e.components);return n.createElement(o.Provider,{value:l},e.children)},s="mdxType",u={inlineCode:"code",wrapper:function(e){var l=e.children;return n.createElement(n.Fragment,{},l)}},d=n.forwardRef((function(e,l){var t=e.components,a=e.mdxType,r=e.originalType,o=e.parentName,g=c(e,["components","mdxType","originalType","parentName"]),s=i(t),d=a,y=s["".concat(o,".").concat(d)]||s[d]||u[d]||r;return t?n.createElement(y,p(p({ref:l},g),{},{components:t})):n.createElement(y,p({ref:l},g))}));function y(e,l){var t=arguments,a=l&&l.mdxType;if("string"==typeof e||a){var r=t.length,p=new Array(r);p[0]=d;var c={};for(var o in l)hasOwnProperty.call(l,o)&&(c[o]=l[o]);c.originalType=e,c[s]="string"==typeof e?e:a,p[1]=c;for(var i=2;i{t.r(l),t.d(l,{assets:()=>o,contentTitle:()=>p,default:()=>u,frontMatter:()=>r,metadata:()=>c,toc:()=>i});var n=t(58168),a=(t(96540),t(15680));const r={sidebar_position:4,product:"\u9ad8\u6027\u80fdAI\u7b97\u529b\u6c60(ACP)"},p="\u3010NGC \u955c\u50cf\u3011nccl-test \u901a\u4fe1\u5e93\u68c0\u6d4b\u6700\u4f73\u5b9e\u8df5",c={unversionedId:"cloud-foundation/compute/acp/acpBestPractices/Job-nccl_test",id:"cloud-foundation/compute/acp/acpBestPractices/Job-nccl_test",title:"\u3010NGC \u955c\u50cf\u3011nccl-test \u901a\u4fe1\u5e93\u68c0\u6d4b\u6700\u4f73\u5b9e\u8df5",description:"\u524d\u7f6e\u51c6\u5907\u5de5\u4f5c",source:"@site/docs/cloud-foundation/compute/acp/acpBestPractices/Job-nccl_test.md",sourceDirName:"cloud-foundation/compute/acp/acpBestPractices",slug:"/cloud-foundation/compute/acp/acpBestPractices/Job-nccl_test",permalink:"/help/docs/cloud-foundation/compute/acp/acpBestPractices/Job-nccl_test",draft:!1,editUrl:"https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/docs/cloud-foundation/compute/acp/acpBestPractices/Job-nccl_test.md",tags:[],version:"current",sidebarPosition:4,frontMatter:{sidebar_position:4,product:"\u9ad8\u6027\u80fdAI\u7b97\u529b\u6c60(ACP)"},sidebar:"tutorialSidebar",previous:{title:"\u63d0\u4ea4\u4e00\u4e2aMPI\u5206\u5e03\u5f0f\u4efb\u52a1",permalink:"/help/docs/cloud-foundation/compute/acp/acpBestPractices/Job-MPI"},next:{title:"\u7f51\u7edc\u8bca\u65ad\u5de5\u5177Network Diagnostic Toolkit",permalink:"/help/docs/cloud-foundation/compute/acp/acpBestPractices/Job-NetworkDiagnosticToolkit"}},o={},i=[{value:"\u524d\u7f6e\u51c6\u5907\u5de5\u4f5c",id:"\u524d\u7f6e\u51c6\u5907\u5de5\u4f5c",level:3},{value:"MPI\u4efb\u52a1_\u5355\u673a\u901a\u4fe1\u5e93\u68c0\u6d4b\uff081\u526f\u672c\uff09",id:"mpi\u4efb\u52a1_\u5355\u673a\u901a\u4fe1\u5e93\u68c0\u6d4b1\u526f\u672c",level:3},{value:"MPI\u4efb\u52a1_\u591a\u673a\u901a\u4fe1\u5e93\u68c0\u6d4b\uff082\u526f\u672c\uff09",id:"mpi\u4efb\u52a1_\u591a\u673a\u901a\u4fe1\u5e93\u68c0\u6d4b2\u526f\u672c",level:3},{value:"\u6d4b\u8bd5\u7ed3\u679c\u89e3\u8bfb",id:"\u6d4b\u8bd5\u7ed3\u679c\u89e3\u8bfb",level:3},{value:"\u5982\u4f55\u4fee\u6539hostfile",id:"\u5982\u4f55\u4fee\u6539hostfile",level:3},{value:"\u64cd\u4f5c\u547d\u4ee4\u89e3\u91ca",id:"\u64cd\u4f5c\u547d\u4ee4\u89e3\u91ca",level:3}],g={toc:i},s="wrapper";function u(e){let{components:l,...r}=e;return(0,a.yg)(s,(0,n.A)({},g,r,{components:l,mdxType:"MDXLayout"}),(0,a.yg)("h1",{id:"ngc-\u955c\u50cfnccl-test-\u901a\u4fe1\u5e93\u68c0\u6d4b\u6700\u4f73\u5b9e\u8df5"},"\u3010NGC \u955c\u50cf\u3011nccl-test \u901a\u4fe1\u5e93\u68c0\u6d4b\u6700\u4f73\u5b9e\u8df5"),(0,a.yg)("h3",{id:"\u524d\u7f6e\u51c6\u5907\u5de5\u4f5c"},"\u524d\u7f6e\u51c6\u5907\u5de5\u4f5c"),(0,a.yg)("ol",null,(0,a.yg)("li",{parentName:"ol"},"\u8d44\u6e90\u914d\u7f6e\u9700\u8981\u533a\u5206\u5f53\u524d\u662f\u5355\u673a\u68c0\u6d4b\u8fd8\u662f\u591a\u673a\u68c0\u6d4b\uff0c\u526f\u672c\u6570\u5bf9\u5e941\u6216\u8005n\u3002")),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(24827).A,width:"1378",height:"657"})),(0,a.yg)("ol",{start:2},(0,a.yg)("li",{parentName:"ol"},"\u521b\u5efa\u3010",(0,a.yg)("a",{parentName:"li",href:"https://docs.open-mpi.org/en/main/man-openmpi/man1/mpirun.1.html"},"MPI"),"\u3011\u68c0\u6d4b\u4efb\u52a1\u65f6\uff0c\u9009\u62e9\u5e26\u3010Base\u3011\u6807\u8bc6\u7684nvidia24.02\u7248\u672c",(0,a.yg)("a",{parentName:"li",href:"https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/index.html"},"NGC\u5b98\u65b9\u955c\u50cf"),"\u3002")),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(62848).A,width:"1516",height:"230"})),(0,a.yg)("ol",{start:3},(0,a.yg)("li",{parentName:"ol"},"\u4e3a\u4e86\u8fbe\u5230\u6700\u4f73\u6027\u80fd\u6d4b\u8bd5\u7ed3\u679c\uff0c\u4e0d\u540c\u7684\u7f51\u7edc\u96c6\u7fa4\u65b9\u6848\u9700\u8981\u6ce8\u5165\u4e0d\u540c\u7684\u73af\u5883\u53d8\u91cf\uff0c\u8be6\u60c5\u53c2\u8003 ",(0,a.yg)("a",{parentName:"li",href:"https://console.sensecore.cn/help/docs/cloud-foundation/compute/acp/acpUserGuide/acpEnvironmentVariable"},"\u901a\u7528\u73af\u5883\u53d8\u91cf")," \u3002")),(0,a.yg)("p",null,"\u4f8b\u5982\u5728RoCE v2 *400G\u7f51\u7edc\u65b9\u6848\u4e0b\uff0c\u70b9\u51fb\u3010\u9ad8\u7ea7\u914d\u7f6e\u3011\uff0c\u914d\u7f6e\u4ee5\u4e0b\u73af\u5883\u53d8\u91cf\uff1a"),(0,a.yg)("pre",null,(0,a.yg)("code",{parentName:"pre"},"NCCL_IB_GID_INDEX\xa0=\xa05\nNCCL_IB_TC\xa0=\xa0138\nNCCL_IB_QPS_PER_CONNECTION\xa0=\xa08\nOMPI_MCA_btl_tcp_if_include\xa0=\xa0eth0\n")),(0,a.yg)("p",null,(0,a.yg)("strong",{parentName:"p"},"\u8865\u5145\u8bf4\u660e\uff1a")),(0,a.yg)("p",null,"\u5982\u679c\u901a\u8fc7nccl-test\u505a\u96c6\u7fa4\u7f51\u7edc\u7684\u57fa\u7ebf\u6d4b\u8bd5\uff0c\u53ef\u4ee5\u8bbe\u7f6e",(0,a.yg)("inlineCode",{parentName:"p"},"NCCL_MIN_NCHANNELS=32"),"\uff0c\u53ef\u4ee5\u5927\u5e45\u5ea6\u63d0\u9ad8\u6d4b\u8bd5\u7ed3\u679c\uff0c\u8fd9\u4e2a\u53c2\u6570\u7528\u4e8e ",(0,a.yg)("strong",{parentName:"p"},"\u4f18\u5316 NCCL \u901a\u4fe1\u5e76\u53d1\u5ea6")," \u7684\u73af\u5883\u53d8\u91cf\u8bbe\u7f6e\uff0c\u63a7\u5236 NCCL \u5185\u90e8\u7528\u4e8e\u5e76\u53d1\u901a\u4fe1\u7684\u6570\u636e\u901a\u9053\uff08channel\uff09\u7684 ",(0,a.yg)("strong",{parentName:"p"},"\u6700\u5c0f\u6570\u91cf"),"\u3002\u4e0d\u8fc7\u5728\u5b9e\u9645\u8bad\u7ec3\u8fc7\u7a0b\u4e2d\uff0c\u901a\u4fe1\u7528\u7684\u8d44\u6e90\u591a\uff0c\u8ba1\u7b97\u53ef\u7528\u8d44\u6e90\u5c31\u4f1a\u53d8\u5c11\uff0c\u7528\u6237\u8981\u6839\u636e\u5b9e\u9645\u60c5\u51b5\u505a\u8c03\u6574\u3002"),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(52712).A,width:"1069",height:"264"})),(0,a.yg)("ol",{start:4},(0,a.yg)("li",{parentName:"ol"},"\u542f\u52a8\u547d\u4ee4\u5199\u6210 ",(0,a.yg)("inlineCode",{parentName:"li"},"sleep inf"),"\uff0c\u8ba9\u5bb9\u5668\u4fdd\u6301\u957f\u8fd0\u884c\u72b6\u6001\uff0c\u65b9\u4fbf\u6267\u884c\u76f8\u5173\u7684\u68c0\u6d4b\u811a\u672c\u3002")),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(886).A,width:"1314",height:"261"})),(0,a.yg)("ol",{start:5},(0,a.yg)("li",{parentName:"ol"},"\u901a\u8fc7\u3010Web Terminal\u3011\u767b\u5f55\u5230 pod \uff0c\u6ce8\u610f\u9009\u62e9\u540d\u79f0\u542b ",(0,a.yg)("strong",{parentName:"li"},"launcher")," \u7684 Worker\u3002")),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(97823).A,width:"1875",height:"548"})),(0,a.yg)("ol",{start:6},(0,a.yg)("li",{parentName:"ol"},"\u8f93\u5165",(0,a.yg)("inlineCode",{parentName:"li"},"ldconfig -p | grep libnccl.so"),"\uff0c\u68c0\u67e5\u7cfb\u7edf\u4e2dNCCL \u5e93\u5b89\u88c5\u8def\u5f84\u53ca\u7248\u672c\u4fe1\u606f\u3002")),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(60964).A,width:"1912",height:"138"})),(0,a.yg)("ol",{start:7},(0,a.yg)("li",{parentName:"ol"},"\u8f93\u5165",(0,a.yg)("inlineCode",{parentName:"li"},"ll /usr/lib/x86_64-linux-gnu/libnccl.so*"),"\uff0c\u67e5\u770b NCCL \u5e93\u6587\u4ef6\u7684\u8be6\u7ec6\u4fe1\u606f\uff0c\u5305\u62ec\u7b26\u53f7\u94fe\u63a5\u5173\u7cfb\u548c\u7248\u672c\u53f7\u3002")),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(14861).A,width:"1877",height:"88"})),(0,a.yg)("ol",{start:8},(0,a.yg)("li",{parentName:"ol"},"\u8f93\u5165",(0,a.yg)("inlineCode",{parentName:"li"},"ll /usr/local/bin/*_perf*"),"\uff0c\u67e5\u770b\u7cfb\u7edf\u4e2d\u4e0e\u6027\u80fd\u6d4b\u8bd5\u76f8\u5173\u7684\u5de5\u5177\u3002")),(0,a.yg)("p",null,"\u6ce8\u610f\uff1aMPI\u4efb\u52a1\u5fc5\u987b\u4f7f\u7528\u540e\u7f00\u4e3a ",(0,a.yg)("strong",{parentName:"p"},"_mpi")," \u7684\u547d\u4ee4\u3002"),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(57082).A,width:"1875",height:"445"})),(0,a.yg)("h3",{id:"mpi\u4efb\u52a1_\u5355\u673a\u901a\u4fe1\u5e93\u68c0\u6d4b1\u526f\u672c"},"MPI\u4efb\u52a1_\u5355\u673a\u901a\u4fe1\u5e93\u68c0\u6d4b\uff081\u526f\u672c\uff09"),(0,a.yg)("p",null,"\u6ce8\uff1a\u5355\u673a1\u526f\u672c8\u5361\uff0cnp\u4e3a1*8\uff0c8\u4e2a\u8fdb\u7a0b\u3002"),(0,a.yg)("p",null,"\u64cd\u4f5c\u6307\u4ee4\uff1a",(0,a.yg)("inlineCode",{parentName:"p"},"mpirun --allow-run-as-root -bind-to none -map-by slot all_reduce_perf_mpi -b 2048M -e 8192M -f 2 -g 1")),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(89571).A,width:"1681",height:"479"})),(0,a.yg)("h3",{id:"mpi\u4efb\u52a1_\u591a\u673a\u901a\u4fe1\u5e93\u68c0\u6d4b2\u526f\u672c"},"MPI\u4efb\u52a1_\u591a\u673a\u901a\u4fe1\u5e93\u68c0\u6d4b\uff082\u526f\u672c\uff09"),(0,a.yg)("p",null,"\u6ce8\uff1a\u591a\u673a2\u526f\u672c16\u5361\uff0cnp\u4e3a2*8\uff0c16\u4e2a\u8fdb\u7a0b\u3002"),(0,a.yg)("p",null,"\u901a\u8fc7 nccl-test \u68c0\u6d4b2*8\u5361\u7684\u96c6\u5408\u901a\u4fe1\u7684\u60c5\u51b5\uff0c\u526f\u672c\u6570\u8bbe\u7f6e\u4e3a2\uff0c\u9009\u62e98\u5361\u7684\u4ea7\u54c1\u89c4\u683c\u3002"),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(1843).A,width:"1954",height:"557"})),(0,a.yg)("p",null,"\u4efb\u52a1\u521b\u5efa\u597d\u540e\uff0c\u5728\u3010\u4efb\u52a1\u8be6\u60c5\u3011\u9875\u7684\u3010Worker\u5217\u8868\u3011tab\u9875\uff0c\u901a\u8fc7\u3010Web Terminal\u3011\u767b\u5f55\u5230pod \uff0c\u6ce8\u610f\u9009\u62e9\u540d\u79f0\u542b ",(0,a.yg)("strong",{parentName:"p"},"launcher")," \u7684 Worker\u3002"),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(62986).A,width:"1659",height:"420"})),(0,a.yg)("p",null,"\u64cd\u4f5c\u6307\u4ee4\uff1a",(0,a.yg)("inlineCode",{parentName:"p"},'mpirun --allow-run-as-root -bind-to none -map-by slot --mca plm_rsh_agent "ssh -p 2222" -mca pml ob1 -mca btl ^openib -mca plm_rsh_num_concurrent 300 -mca routed_radix 600 -mca plm_rsh_no_tree_spawn 1 all_reduce_perf_mpi -b 2048M -e 8192M -f 2 -g 1')),(0,a.yg)("p",null,"2\u53f0\u7684\u6d4b\u8bd5\u7ed3\u679c\u5982\u4e0b\uff1a"),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(67994).A,width:"1891",height:"681"})),(0,a.yg)("h3",{id:"\u6d4b\u8bd5\u7ed3\u679c\u89e3\u8bfb"},"\u6d4b\u8bd5\u7ed3\u679c\u89e3\u8bfb"),(0,a.yg)("p",null,"\u8be6\u60c5\u53ef\u67e5\u9605\u5b98\u65b9\u6587\u6863\uff1a",(0,a.yg)("a",{parentName:"p",href:"https://github.com/NVIDIA/nccl-tests"},"https://github.com/NVIDIA/nccl-tests")),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(89571).A,width:"1681",height:"479"})),(0,a.yg)("ol",null,(0,a.yg)("li",{parentName:"ol"},"\u6570\u636e\u89c4\u6a21\uff1a")),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"size"),"\uff1a\u64cd\u4f5c\u5904\u7406\u7684\u6570\u636e\u7684\u5927\u5c0f\uff0c\u4ee5\u5b57\u8282\u4e3a\u5355\u4f4d\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"count"),"\uff1a\u64cd\u4f5c\u5904\u7406\u7684\u5143\u7d20\u7684\u6570\u91cf\u3002count \u8d8a\u5927\uff0c\u5bf9\u603b\u7ebf\u5e26\u5bbd\u3001\u7f51\u7edc\u5e26\u5bbd\u7684\u9700\u6c42\u8d8a\u9ad8\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"type"),"\uff1a\u5143\u7d20\u7684\u6570\u636e\u7c7b\u578b\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"redop"),"\uff1a\u4f7f\u7528\u7684\u5f52\u7ea6\u64cd\u4f5c\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"root"),"\uff1a\u5bf9\u4e8e\u67d0\u4e9b\u64cd\u4f5c\uff08\u5982 reduce \u548c broadcast\uff09\uff0c\u8fd9\u5217\u6307\u5b9a\u4e86\u6839\u8282\u70b9\u7684\u7f16\u53f7\u3002-1\u8868\u793a\u8fd9\u4e2a\u64cd\u4f5c\u6ca1\u6709\u6839\u8282\u70b9\uff08\u56e0\u4e3a all-reduce \u64cd\u4f5c\u6d89\u53ca\u6240\u6709\u8282\u70b9\uff09\u3002"),(0,a.yg)("ol",{start:2},(0,a.yg)("li",{parentName:"ol"},"\u901a\u4fe1\u7c7b\u578b\uff1a")),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"out-of-place"),"\uff1a\u64cd\u4f5c\u7ed3\u679c\u5b58\u50a8\u5728\u65b0\u5206\u914d\u7684\u5185\u5b58\u533a\u57df\uff0c\u9700\u5c06\u539f\u6570\u636e\u590d\u5236\u5230\u65b0\u4f4d\u7f6e\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"in-place"),"\uff1a\u76f4\u63a5\u5728\u539f\u5185\u5b58\u5730\u5740\u4e0a\u4fee\u6539\u6570\u636e\uff0c\u65e0\u9700\u989d\u5916\u5185\u5b58\u5206\u914d\u3002"),(0,a.yg)("ol",{start:3},(0,a.yg)("li",{parentName:"ol"},"\u6027\u80fd\u6307\u6807\uff1a")),(0,a.yg)("p",null,"\u6ce8\u610f\uff1a\u5404\u4e2a\u6307\u6807\u5747\u4e3a\u5bf9\u5e94\u6570\u636e\u5927\u5c0fsize\u4e0b\uff0c20 \u6b21\u8fed\u4ee3\uff08iters=20\uff09\u7ed3\u679c\u7684\u5e73\u5747\u503c\uff0c\u8fed\u4ee3\u6b21\u6570\u53ef\u901a\u8fc7-n\u53c2\u6570\u8c03\u6574\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"time (us)"),"\uff1a\u5355\u6b21\u64cd\u4f5c\u4ece\u5f00\u59cb\u5230\u7ed3\u675f\u7684\u8017\u65f6\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"algbw (GB/s)"),"\uff1a\u7b97\u6cd5\u5b9e\u9645\u8fbe\u5230\u7684\u6570\u636e\u4f20\u8f93\u5e26\u5bbd\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"busbw (GB/s)"),"\uff1a\u603b\u7ebf\u5e26\u5bbd\uff08\u7406\u8bba\u5cf0\u503c\uff09\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"#wrong"),"\uff1a\u9519\u8bef\u6570\uff08\u82e5\u975e0\uff0c\u53ef\u80fd\u8868\u793a\u6709\u4e00\u4e9b\u9519\u8bef\u53d1\u751f\uff09\u3002"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"Avg bus bandwidth"),"\uff1a\u5e73\u5747\u603b\u7ebf\u5e26\u5bbd\u3002 "),(0,a.yg)("h3",{id:"\u5982\u4f55\u4fee\u6539hostfile"},"\u5982\u4f55\u4fee\u6539hostfile"),(0,a.yg)("p",null,"\u65b0\u5efa\u4e00\u4e2a8\u526f\u672c\uff0c\u6bcf\u4e2a\u526f\u672c4\u5f20GPU\u5361\u4efb\u52a1\uff0c\u603b\u5171\u67098*4=32\u5f20\u5361\uff0c\u6267\u884c\u5982\u4e0b\u547d\u4ee4\uff0c\u4f1a\u770b\u5230rank 0 ~ rank31\u7684\u8bbe\u5907\u4fe1\u606f\u3002 "),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},'mpirun --allow-run-as-root -bind-to none -map-by slot --mca plm_rsh_agent "ssh -p 2222" -mca pml ob1 -mca btl ^openib -mca plm_rsh_num_concurrent 300 -mca routed_radix 600 -mca plm_rsh_no_tree_spawn 1 all_reduce_perf_mpi -b 2048M -e 8192M -f 2 -g 1')," "),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(5079).A,width:"1102",height:"693"})," "),(0,a.yg)("p",null,"mpirun\u9ed8\u8ba4\u7684hostfile\u6587\u4ef6\u4f4d\u7f6e\u4e3a",(0,a.yg)("inlineCode",{parentName:"p"},"/etc/mpi/hostfile"),"\uff0c\u5982\u679c\u60f3\u8981\u4fee\u6539\u8282\u70b9\u5bf9\u5e94\u7684hostip\u6216\u8005\u6307\u5b9a\u8282\u70b9\u6d4b\u8bd5\uff08\u6bd4\u59828\u526f\u672c\u7684\u4efb\u52a1\uff0c\u6307\u5b9a\u5176\u4e2d4\u4e2a\u8282\u70b9\u6216\u8005\u6307\u5b9a\u5f02\u5e38\u8282\u70b9\u3002 "),(0,a.yg)("p",null,"\u56e0\u4e3a\u5b98\u65b9\u7684",(0,a.yg)("inlineCode",{parentName:"p"},"/etc/mpi/hostfile")," \u4e3aread only\uff0c\u590d\u5236\u4e00\u4efdhostfile\u8fdb\u884c\u7f16\u8f91\uff1a",(0,a.yg)("inlineCode",{parentName:"p"},"mkdir /etc/nccl-test && cat /etc/mpi/hostfile > /etc/nccl-test/mpihostfile")," \uff0c\u7136\u540e\u6267\u884c\uff1a",(0,a.yg)("inlineCode",{parentName:"p"},"vim /etc/nccl-test/mpihostfile "),"\u3002\u6ce8\u91ca\u63892\u30014\u30016\u30018\u884c\uff0c\u5982 \u4e0b\u56fe\u6240\u793a\uff0c\u8fd9\u65f6\u8dd1\u6d4b\u8bd5\u53ea\u67094*4=16\u4e2aGPU\u53c2\u4e0e\u6d4b\u8bd5\u3002 "),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(59502).A,width:"752",height:"201"})," "),(0,a.yg)("p",null,"\u6307\u5b9a\u5176\u4ed6\u4f4d\u7f6e\u7684hostfile\u53ef\u4f7f\u7528",(0,a.yg)("inlineCode",{parentName:"p"},"--hostfile"),"\u53c2\u6570\uff0c\u793a\u4f8b\u4e3a\uff1a",(0,a.yg)("inlineCode",{parentName:"p"},"--hostfile /etc/nccl-test/mpihostfile"),"\u3002\u6267\u884c\u5982\u4e0b\u547d\u4ee4\uff0c\u4f1a\u770b\u5230rank 0 ~ rank15\u7684\u8bbe\u5907\u4fe1\u606f\u3002 "),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},'mpirun --allow-run-as-root -bind-to none -map-by slot --mca plm_rsh_agent "ssh -p 2222" -mca pml ob1 -mca btl ^openib -mca plm_rsh_num_concurrent 300 -mca routed_radix 600 -mca plm_rsh_no_tree_spawn 1 --hostfile /etc/nccl-test/mpihostfile all_reduce_perf_mpi -b 2048M -e 8192M -f 2 -g 1')," "),(0,a.yg)("p",null,(0,a.yg)("img",{src:t(29861).A,width:"1893",height:"557"})," "),(0,a.yg)("h3",{id:"\u64cd\u4f5c\u547d\u4ee4\u89e3\u91ca"},"\u64cd\u4f5c\u547d\u4ee4\u89e3\u91ca"),(0,a.yg)("table",null,(0,a.yg)("tr",null,(0,a.yg)("th",{width:"25%"},"\u64cd\u4f5c\u6307\u4ee4"),(0,a.yg)("th",{width:"50%"},"\u89e3\u91ca"),(0,a.yg)("th",{colspan:"3",width:"25%"},"\u5907\u6ce8")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"ldconfig -p | grep libnccl.so"),(0,a.yg)("td",null,"\u68c0\u67e5\u7cfb\u7edf\u4e2d\u662f\u5426\u5df2\u5b89\u88c5\xa0NCCL \u5e93\xa0\u4ee5\u53ca\u5176\u7248\u672c\u4fe1\u606f"),(0,a.yg)("td",{colspan:"3"})),(0,a.yg)("tr",null,(0,a.yg)("td",null,"ll /usr/lib/x86_64-linux-gnu/libnccl.so*"),(0,a.yg)("td",null,"\u67e5\u770b NCCL \u5e93\u6587\u4ef6\u7684\u8be6\u7ec6\u4fe1\u606f\uff0c\u5305\u62ec\u7b26\u53f7\u94fe\u63a5\u5173\u7cfb\u548c\u7248\u672c\u53f7"),(0,a.yg)("td",{colspan:"3"})),(0,a.yg)("tr",null,(0,a.yg)("td",null,"ll /usr/local/bin/*_perf*"),(0,a.yg)("td",null,"\u67e5\u770b\u7cfb\u7edf\u4e2d\u4e0e\u6027\u80fd\u6d4b\u8bd5\u76f8\u5173\u7684\u5de5\u5177\u3002\u6bcf\u4e2a\u6587\u4ef6\u5bf9\u5e94\u4e00\u79cd\u96c6\u4f53\u901a\u4fe1\u64cd\u4f5c\uff0c\u7528\u4e8e\u6d4b\u8bd5\u7279\u5b9a\u64cd\u4f5c\u7684\u5e26\u5bbd\u3001\u5ef6\u8fdf\u7b49\u6307\u6807"),(0,a.yg)("td",null,"\u8be5\u547d\u4ee4\u7ed3\u679c\u8bf4\u660e\u89c1\u540e\u8868")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"--hostfile /etc/mpi/hostfile"),(0,a.yg)("td",null,"\u6307\u5b9a\u4e3b\u673a\u6587\u4ef6\u8def\u5f84"),(0,a.yg)("td",{colspan:"3",rowspan:"3"},"\u8282\u70b9\u4e0e\u8fdb\u7a0b\u914d\u7f6e")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"--allow-run-as-root"),(0,a.yg)("td",null,"\u5141\u8bb8\u4ee5 root \u7528\u6237\u8eab\u4efd\u8fd0\u884c MPI \u4efb\u52a1")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-np 16 "),(0,a.yg)("td",null,"\u6307\u5b9a\u8981\u8fd0\u884c\u7684\u8fdb\u7a0b\u6570,\u5e94\u4e0e\u8981\u4f7f\u7528\u7684\u603b GPU \u6570\u91cf\u76f8\u5339\u914d\u3002\u5982 2 \u4e2a\u526f\u672c\uff0c\u6bcf\u53f0\u6709 8 \u4e2a GPU\uff0c\u5219\u4e3a-np 16")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-bind-to none"),(0,a.yg)("td",null,"\u4e0d\u5c06\u8fdb\u7a0b\u7ed1\u5b9a\u5230\u7279\u5b9a CPU\uff0c\u4eb2\u548c\u6027\u8bbe\u7f6e\u4e3anone\uff0c\u53ef\u4ee5\u53bb\u6389"),(0,a.yg)("td",{colspan:"3",rowspan:"2"},"\u8fdb\u7a0b\u7ed1\u5b9a\u4e0e\u6620\u5c04\u7b56\u7565")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-map-by slot"),(0,a.yg)("td",null,"\u8868\u793a\u4efb\u52a1\u4f1a\u6309\u7167 slot \u7684\u987a\u5e8f\u5206\u914d\u5230\u8282\u70b9\u4e0a")),(0,a.yg)("tr",null,(0,a.yg)("td",null,'--mca plm_rsh_agent "ssh -p 2222"'),(0,a.yg)("td",null,"\u4f7f\u7528 SSH \u901a\u8fc7\u7aef\u53e3 2222 \u8fdb\u884c\u8282\u70b9\u95f4\u901a\u4fe1"),(0,a.yg)("td",{colspan:"3",rowspan:"6"},"\u5e95\u5c42\u901a\u4fe1\u5e93\u914d\u7f6e\uff08MCA \u53c2\u6570\uff09")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-mca pml ob1"),(0,a.yg)("td",null,"\u4f7f\u7528OpenMPI BTL\u4f5c\u4e3a\u6d88\u606f\u4f20\u9012\u5c42")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-mca btl ^openib"),(0,a.yg)("td",null,"\u7981\u7528 InfiniBand \u7f51\u7edc\uff08\u4f7f\u7528\u4ee5\u592a\u7f51\u66ff\u4ee3\uff09")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-mca plm_rsh_num_concurrent 300"),(0,a.yg)("td",null,"\u5141\u8bb8 300 \u4e2a\u5e76\u53d1 SSH \u8fde\u63a5")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-mca routed_radix 600"),(0,a.yg)("td",null,"\u8def\u7531\u57fa\u6570\u8bbe\u7f6e\u4e3a 600\uff08\u5f71\u54cd\u5927\u89c4\u6a21\u96c6\u7fa4\u901a\u4fe1\u62d3\u6251\uff09")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-mca plm_rsh_no_tree_spawn 1"),(0,a.yg)("td",null,"\u7981\u7528\u6811\u5f62 Spawn \u4f18\u5316\uff08\u9002\u7528\u4e8e\u7279\u6b8a\u7f51\u7edc\u73af\u5883\uff09")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-b 2048M"),(0,a.yg)("td",null,"\u6700\u5c0f\u6d4b\u8bd5\u6570\u636e\u91cf\u4e3a 2048MB\uff082GB\uff09"),(0,a.yg)("td",{colspan:"3",rowspan:"4"},"\u6027\u80fd\u6d4b\u8bd5\u53c2\u6570")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-e 8192M"),(0,a.yg)("td",null,"\u6700\u5927\u6d4b\u8bd5\u6570\u636e\u91cf\u4e3a 8192MB\uff088GB\uff09")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-f 2"),(0,a.yg)("td",null,"\u6570\u636e\u91cf\u6309 2 \u7684\u5e42\u6b21\u9012\u589e\uff082GB \u2192 4GB \u2192 8GB\uff09")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"-g 1"),(0,a.yg)("td",null,"\u6bcf\u4e2a\u8fdb\u7a0b\u4f7f\u7528 1 \u4e2a GPU"))),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"ll /usr/local/bin/*_perf*"),"\u547d\u4ee4\u7ed3\u679c\u8bf4\u660e\uff1a"),(0,a.yg)("table",null,(0,a.yg)("tr",null,(0,a.yg)("th",{width:"25%"},"\u6587\u4ef6\u540d "),(0,a.yg)("th",{width:"25%"},"\u6d4b\u8bd5\u7684\u901a\u4fe1\u64cd\u4f5c"),(0,a.yg)("th",{width:"50%"},"\u8bf4\u660e ")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"all_gather_perf "),(0,a.yg)("td",null,"\u5168\u6536\u96c6\uff08AllGather\uff09"),(0,a.yg)("td",null,"\u8be5\u64cd\u4f5c\u4e2d\uff0c\u6bcf\u4e2a\u8282\u70b9\u90fd\u6709\u4e00\u4e2a\u503c\uff0c\u8fd9\u4e9b\u503c\u88ab\u6536\u96c6\u5230\u4e00\u4e2a\u5217\u8868\u4e2d\uff0c\u7136\u540e\u8fd9\u4e2a\u5217\u8868\u88ab\u53d1\u9001\u56de\u6240\u6709\u7684\u8282\u70b9\u3002")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"all_reduce_perf "),(0,a.yg)("td",null,"\u5f52\u7ea6\u540e\u5e7f\u64ad\uff08AllReduce\uff09"),(0,a.yg)("td",null,"\u8be5\u64cd\u4f5c\u4e2d\uff0c\u6240\u6709\u7684\u8282\u70b9\u90fd\u6709\u4e00\u4e2a\u8f93\u5165\u503c\uff0c\u8fd9\u4e9b\u503c\u88ab\u5f52\u7ea6\uff08\u5982\u6c42\u548c\u6216\u6c42\u6700\u5927\u503c\uff09\u4e3a\u4e00\u4e2a\u5355\u4e00\u503c\uff0c\u7136\u540e\u8be5\u503c\u88ab\u53d1\u9001\u56de\u6240\u6709\u7684\u8282\u70b9\u3002")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"alltoall_perf"),(0,a.yg)("td",null,"\u5168\u4ea4\u6362\uff08AllToAll\uff09"),(0,a.yg)("td",null,"\u8be5\u64cd\u4f5c\u4e2d\uff0c\u6bcf\u4e2a\u8282\u70b9\u90fd\u53d1\u9001\u4e00\u4e2a\u503c\u7ed9\u6240\u6709\u5176\u4ed6\u7684\u8282\u70b9\uff0c\u5e76\u4ece\u6240\u6709\u5176\u4ed6\u7684\u8282\u70b9\u63a5\u6536\u4e00\u4e2a\u503c\u3002")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"broadcast_perf"),(0,a.yg)("td",null,"\u5e7f\u64ad\uff08Broadcast\uff09"),(0,a.yg)("td",null,"\u8be5\u64cd\u4f5c\u4e2d\uff0c\u6bcf\u4e2a\u8282\u70b9\u6709\u4e00\u4e2a\u503c\uff0c\u7136\u540e\u8fd9\u4e2a\u503c\u88ab\u53d1\u9001\u5230\u6240\u6709\u5176\u4ed6\u7684\u8282\u70b9\u3002")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"gather_perf"),(0,a.yg)("td",null,"\u6536\u96c6\uff08Gather\uff09"),(0,a.yg)("td",null,"\u8be5\u64cd\u4f5c\u4e2d\uff0c\u6bcf\u4e2a\u8282\u70b9\u90fd\u6709\u4e00\u4e2a\u503c\uff0c\u8fd9\u4e9b\u503c\u88ab\u6536\u96c6\u5230\u4e00\u4e2a\u5217\u8868\u4e2d\uff0c\u7136\u540e\u8be5\u5217\u8868\u88ab\u53d1\u9001\u5230\u4e00\u4e2a\u6307\u5b9a\u7684\u8282\u70b9\u3002")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"reduce_perf"),(0,a.yg)("td",null,"\u5f52\u7ea6\uff08Reduce\uff09"),(0,a.yg)("td",null,"\u8be5\u64cd\u4f5c\u4e2d\uff0c\u6240\u6709\u8282\u70b9\u90fd\u6709\u4e00\u4e2a\u8f93\u5165\u503c\uff0c\u8fd9\u4e9b\u503c\u88ab\u5f52\u7ea6\u6210\u4e00\u4e2a\u5355\u4e00\u7684\u503c\uff0c\u7136\u540e\u8be5\u503c\u88ab\u53d1\u9001\u5230\u4e00\u4e2a\u6307\u5b9a\u7684\u8282\u70b9\u3002")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"reduce_scatter_perf"),(0,a.yg)("td",null,"\u5f52\u7ea6\u6563\u5c04\uff08ReduceScatter\uff09"),(0,a.yg)("td",null,"\u8be5\u64cd\u4f5c\u4e2d\uff0c\u6240\u6709\u8282\u70b9\u90fd\u6709\u4e00\u4e2a\u8f93\u5165\u503c\uff0c\u8fd9\u4e9b\u503c\u88ab\u5f52\u7ea6\u6210\u4e00\u4e2a\u5355\u4e00\u7684\u503c\uff0c\u7136\u540e\u8fd9\u4e2a\u503c\u88ab\u5206\u6563\u5230\u6240\u6709\u7684\u8282\u70b9\u3002")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"scatter_perf"),(0,a.yg)("td",null,"\u6563\u5c04\uff08Scatter\uff09"),(0,a.yg)("td",null,"\u8be5\u64cd\u4f5c\u4e2d\uff0c\u4e00\u4e2a\u8282\u70b9\u6709\u4e00\u4e2a\u5217\u8868\u7684\u503c\uff0c\u7136\u540e\u8fd9\u4e9b\u503c\u88ab\u5206\u6563\u5230\u6240\u6709\u5176\u4ed6\u7684\u8282\u70b9\u3002")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"sendrecv_perf"),(0,a.yg)("td",null,"\u70b9\u5bf9\u70b9\u53d1\u9001\u63a5\u6536\uff08Send/Recv\uff09"),(0,a.yg)("td",null,"\u8be5\u64cd\u4f5c\u4e2d\uff0c\u4e00\u4e2a\u8282\u70b9\u6709\u4e00\u4e2a\u5305\u542b\u591a\u4e2a\u503c\u7684\u5217\u8868\uff0c\u7136\u540e\u5176\u4e2d\u7684\u503c\u88ab\u5206\u6563\u5230\u6240\u6709\u5176\u4ed6\u7684\u8282\u70b9\u3002")),(0,a.yg)("tr",null,(0,a.yg)("td",null,"hypercube_perf"),(0,a.yg)("td",null,"\u8d85\u7acb\u65b9\u4f53\u901a\u4fe1"),(0,a.yg)("td",null,"\u5728 hypercube\u901a\u4fe1\u6a21\u5f0f\u4e2d\uff0c\u8282\u70b9\u88ab\u7ec4\u7ec7\u6210\u4e00\u4e2a\u8d85\u7acb\u65b9\u4f53\u7684\u7ed3\u6784\uff0c\u7136\u540e\u5728\u8fd9\u4e2a\u7ed3\u6784\u4e2d\u8fdb\u884c\u901a\u4fe1\u3002"))),(0,a.yg)("p",null,(0,a.yg)("strong",{parentName:"p"},"\u5355\u673a\u64cd\u4f5c\u6307\u4ee4\u8865\u5145\u8bf4\u660e\uff1a")),(0,a.yg)("p",null,"\u5355\u673a\u60c5\u51b5\u4e0b\uff0c\u4e0d\u9700\u8981\u6dfb\u52a0\u4e0e\u591a\u673a\u76f8\u5173\u7684\u7279\u5b9a\u6307\u4ee4\u53c2\u6570\uff0c\u6837\u4f8b\u5982\u4e0b\uff1a"),(0,a.yg)("p",null,(0,a.yg)("inlineCode",{parentName:"p"},"mpirun --allow-run-as-root -bind-to none -map-by slot all_reduce_perf_mpi -b 2048M -e 8192M -f 2 -g 1")),(0,a.yg)("p",null,"\u53bb\u9664\u4e0e\u591a\u673a\u76f8\u5173\u7684\u7279\u5b9a\u53c2\u6570",(0,a.yg)("inlineCode",{parentName:"p"},"-mac"),"\uff1a"),(0,a.yg)("p",null,"\u50cf",(0,a.yg)("inlineCode",{parentName:"p"},'-mca plm_rsh_agent "ssh -p 2222"'),"\u8fd9\u7c7b\u7528\u4e8e\u591a\u673a\u95f4\u901a\u8fc7 SSH \u7b49\u65b9\u5f0f\u901a\u4fe1\u7684\u53c2\u6570\uff0c\u5355\u673a\u6d4b\u8bd5\u65f6\u53ef\u53bb\u9664\uff0c\u56e0\u4e3a\u65e0\u9700\u8de8\u673a\u901a\u4fe1\u3002\u8fd8\u6709",(0,a.yg)("inlineCode",{parentName:"p"},"--mca plm_rsh_num_concurrent"),"\u3001",(0,a.yg)("inlineCode",{parentName:"p"},"--mca routed_radix"),"\u3001",(0,a.yg)("inlineCode",{parentName:"p"},"--mca plm_rsh_no_tree_spawn"),"\u7b49\u4e0e\u591a\u673a\u8d44\u6e90\u8c03\u5ea6\u548c\u8fdc\u7a0b Shell \u76f8\u5173\u53c2\u6570\uff0c\u4e00\u822c\u4e5f\u53ef\u5220\u9664\u3002"))}u.isMDXComponent=!0},24827:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test1-9b94d59d2512715771e798ac1aab0cc3.png"},1843:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test10-dad155dc7ab9423b118480b78eef6a62.png"},62986:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test11-4cddb762a9903e6421f7852648f96571.png"},5079:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test14-12bba8cd1eb03f1f698d00f797c6ae24.png"},59502:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test15-ce646646e94bcd53405ffa63557d7884.png"},29861:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test16-8b9c7011f07c94a760b2e34cee0e162c.png"},62848:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test2-b9161abf050f91eea0beb67dd3d5934f.png"},886:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test4-dd9d14c57b5db290d6714c558c3d30dc.png"},97823:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test5-ee9dd68afaee2f15a6cbbcb47cac4d67.png"},60964:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test6-bbb815ec30754bb65b12ff6bdfe17aad.png"},14861:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test7-fa439bac01f762a22cb818dbe940df2d.png"},57082:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccl-test8-11bf94a7f3e115e9b82cb8ddfc1fe00a.png"},52712:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccltest01-bea7ff42f111bdd3c2a688f4f3ac12f8.png"},89571:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccltest02-902e15a0d671901410367014a5cf0c5f.png"},67994:(e,l,t)=>{t.d(l,{A:()=>n});const n=t.p+"assets/images/nccltest03-51d56833dcd3800e92f1eb32d0cc0f60.png"}}]);