"use strict";(self.webpackChunkelementary_public_docs=self.webpackChunkelementary_public_docs||[]).push([[39901],{15680:(e,n,r)=>{r.d(n,{xA:()=>u,yg:()=>y});var t=r(96540);function o(e,n,r){return n in e?Object.defineProperty(e,n,{value:r,enumerable:!0,configurable:!0,writable:!0}):e[n]=r,e}function a(e,n){var r=Object.keys(e);if(Object.getOwnPropertySymbols){var t=Object.getOwnPropertySymbols(e);n&&(t=t.filter((function(n){return Object.getOwnPropertyDescriptor(e,n).enumerable}))),r.push.apply(r,t)}return r}function p(e){for(var n=1;n=0||(o[r]=e[r]);return o}(e,n);if(Object.getOwnPropertySymbols){var a=Object.getOwnPropertySymbols(e);for(t=0;t=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(o[r]=e[r])}return o}var l=t.createContext({}),i=function(e){var n=t.useContext(l),r=n;return e&&(r="function"==typeof e?e(n):p(p({},n),e)),r},u=function(e){var n=i(e.components);return t.createElement(l.Provider,{value:n},e.children)},s="mdxType",m={inlineCode:"code",wrapper:function(e){var n=e.children;return t.createElement(t.Fragment,{},n)}},d=t.forwardRef((function(e,n){var r=e.components,o=e.mdxType,a=e.originalType,l=e.parentName,u=c(e,["components","mdxType","originalType","parentName"]),s=i(r),d=o,y=s["".concat(l,".").concat(d)]||s[d]||m[d]||a;return r?t.createElement(y,p(p({ref:n},u),{},{components:r})):t.createElement(y,p({ref:n},u))}));function y(e,n){var r=arguments,o=n&&n.mdxType;if("string"==typeof e||o){var a=r.length,p=new Array(a);p[0]=d;var c={};for(var l in n)hasOwnProperty.call(n,l)&&(c[l]=n[l]);c.originalType=e,c[s]="string"==typeof e?e:o,p[1]=c;for(var i=2;i{r.r(n),r.d(n,{assets:()=>l,contentTitle:()=>p,default:()=>m,frontMatter:()=>a,metadata:()=>c,toc:()=>i});var t=r(58168),o=(r(96540),r(15680));const a={sidebar_position:3,product:"\u9ad8\u6027\u80fdAI\u7b97\u529b\u6c60(ACP)"},p="\u63d0\u4ea4\u4e00\u4e2aMPI\u5206\u5e03\u5f0f\u4efb\u52a1",c={unversionedId:"cloud-foundation/compute/acp/acpBestPractices/Job-MPI",id:"cloud-foundation/compute/acp/acpBestPractices/Job-MPI",title:"\u63d0\u4ea4\u4e00\u4e2aMPI\u5206\u5e03\u5f0f\u4efb\u52a1",description:"\u5bf9\u4e8eMPI\u63d0\u4ea4\u7684\u5206\u5e03\u5f0f\u4efb\u52a1\uff0c\u6211\u4eec\u4f1a\u521b\u5efa\u4e00\u4e2a\u540e\u7f00\u4e3aLauncher\u7684Pod\u548c\u591a\u4e2a\u540e\u7f00\u4e3aWorker\u7684Pod\uff0c\u5206\u522b\u8868\u793aMPI\u4efb\u52a1\u7684\u542f\u52a8\u8282\u70b9\u548c\u6267\u884c\u8282\u70b9\uff0c\u5f53\u7136Launcher\u5373\u662f\u542f\u52a8\u8282\u70b9\u4e5f\u662f\u6267\u884c\u8282\u70b9\u3002\u4e0ePytorch DDP\u7684\u65b9\u5f0f\u4e0d\u540c\uff0cmpirun\u6ca1\u6709\u63d0\u4f9bmasteraddr\u548cmasterport\uff0c\u800c\u662f\u7531MPI\u7684\u901a\u4fe1\u673a\u5236\u5efa\u7acb\u5bb9\u5668\u4e4b\u95f4\u7684\u8fdb\u7a0b\u62d3\u6251\u5173\u7cfb\u3002\u56e0\u6b64MPI\u5206\u5e03\u5f0f\u4efb\u52a1\u7684\u542f\u52a8\u547d\u4ee4\u5fc5\u987b\u662f\u5982mpirun\u7684\u542f\u52a8\u547d\u4ee4\u6216\u8005\u5305\u542bmpirun\u542f\u52a8\u547d\u4ee4\u7684\u811a\u672c\u3002\u5e38\u89c1\u7684MPI\u5206\u5e03\u5f0f\u8bad\u7ec3\u4efb\u52a1\u6709\uff1a",source:"@site/docs/cloud-foundation/compute/acp/acpBestPractices/Job-MPI.md",sourceDirName:"cloud-foundation/compute/acp/acpBestPractices",slug:"/cloud-foundation/compute/acp/acpBestPractices/Job-MPI",permalink:"/help/docs/cloud-foundation/compute/acp/acpBestPractices/Job-MPI",draft:!1,editUrl:"https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/docs/cloud-foundation/compute/acp/acpBestPractices/Job-MPI.md",tags:[],version:"current",sidebarPosition:3,frontMatter:{sidebar_position:3,product:"\u9ad8\u6027\u80fdAI\u7b97\u529b\u6c60(ACP)"},sidebar:"tutorialSidebar",previous:{title:"\u3010\u5feb\u901f\u5f00\u59cb\u3011\u5fae\u8c03Llama-3-8B-Instruct\u6a21\u578b\u6700\u4f73\u5b9e\u8df5",permalink:"/help/docs/cloud-foundation/compute/acp/acpBestPractices/Job-QuickStart-Llama3-8B"},next:{title:"\u4e91\u5bb9\u5668\u5b9e\u4f8b CCI",permalink:"/help/docs/cloud-foundation/compute/cci/"}},l={},i=[{value:"UI\u754c\u9762\u63d0\u4ea4MPI\u4efb\u52a1\u793a\u4f8b",id:"ui\u754c\u9762\u63d0\u4ea4mpi\u4efb\u52a1\u793a\u4f8b",level:4}],u={toc:i},s="wrapper";function m(e){let{components:n,...a}=e;return(0,o.yg)(s,(0,t.A)({},u,a,{components:n,mdxType:"MDXLayout"}),(0,o.yg)("h1",{id:"\u63d0\u4ea4\u4e00\u4e2ampi\u5206\u5e03\u5f0f\u4efb\u52a1"},"\u63d0\u4ea4\u4e00\u4e2aMPI\u5206\u5e03\u5f0f\u4efb\u52a1"),(0,o.yg)("p",null,"\u5bf9\u4e8eMPI\u63d0\u4ea4\u7684\u5206\u5e03\u5f0f\u4efb\u52a1\uff0c\u6211\u4eec\u4f1a\u521b\u5efa\u4e00\u4e2a\u540e\u7f00\u4e3aLauncher\u7684Pod\u548c\u591a\u4e2a\u540e\u7f00\u4e3aWorker\u7684Pod\uff0c\u5206\u522b\u8868\u793aMPI\u4efb\u52a1\u7684\u542f\u52a8\u8282\u70b9\u548c\u6267\u884c\u8282\u70b9\uff0c\u5f53\u7136Launcher\u5373\u662f\u542f\u52a8\u8282\u70b9\u4e5f\u662f\u6267\u884c\u8282\u70b9\u3002\u4e0ePytorch DDP\u7684\u65b9\u5f0f\u4e0d\u540c\uff0cmpirun\u6ca1\u6709\u63d0\u4f9b",(0,o.yg)("inlineCode",{parentName:"p"},"master_addr"),"\u548c",(0,o.yg)("inlineCode",{parentName:"p"},"master_port"),"\uff0c\u800c\u662f\u7531MPI\u7684\u901a\u4fe1\u673a\u5236\u5efa\u7acb\u5bb9\u5668\u4e4b\u95f4\u7684\u8fdb\u7a0b\u62d3\u6251\u5173\u7cfb\u3002\u56e0\u6b64MPI\u5206\u5e03\u5f0f\u4efb\u52a1\u7684\u542f\u52a8\u547d\u4ee4\u5fc5\u987b\u662f\u5982mpirun\u7684\u542f\u52a8\u547d\u4ee4\u6216\u8005\u5305\u542bmpirun\u542f\u52a8\u547d\u4ee4\u7684\u811a\u672c\u3002\u5e38\u89c1\u7684MPI\u5206\u5e03\u5f0f\u8bad\u7ec3\u4efb\u52a1\u6709\uff1a"),(0,o.yg)("ul",null,(0,o.yg)("li",{parentName:"ul"},"Pytorch+Horovod"),(0,o.yg)("li",{parentName:"ul"},"Pytorch +MPI"),(0,o.yg)("li",{parentName:"ul"},"TensorFlow+MPI "),(0,o.yg)("li",{parentName:"ul"},"TensorFlow+Horovod")),(0,o.yg)("p",null,"MPI\u4efb\u52a1\u5728\u542f\u52a8\u65f6\u4f1a\u505a\u5982\u4e0b\u4e8b\u60c5\uff1a"),(0,o.yg)("ol",null,(0,o.yg)("li",{parentName:"ol"},"\u6536\u96c6\u5f53\u524dMPI\u4efb\u52a1\u6240\u6709\u7684Podname\u5230MPI Launcher Pod\u7684/etc/mpi/hostfile \u6587\u4ef6\u4e2d, \u683c\u5f0f\u5982\u4e0b\uff1a")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"MPI-zjwghlra-launcher slots=8\nMPI-zjwghlra-worker-0 slots=8\nMPI-zjwghlra-worker-1 slots=8\nMPI-zjwghlra-worker-2 slots=8\nMPI-zjwghlra-worker-3 slots=8\nMPI-zjwghlra-worker-4 slots=8\nMPI-zjwghlra-worker-5 slots=8\nMPI-zjwghlra-worker-6 slots=8\n")),(0,o.yg)("ol",{start:2},(0,o.yg)("li",{parentName:"ol"},(0,o.yg)("p",{parentName:"li"},"\u5728MPI Launcher Pod\u4e2d\u589e\u52a0",(0,o.yg)("inlineCode",{parentName:"p"},"OMPI_MCA_plm_rsh_agent=/etc/mpi/kubexec.sh"),"\u73af\u5883\u53d8\u91cf\u4f7f\u5f97mpirun\u5efa\u7acb\u8fdb\u7a0b\u901a\u4fe1\u65f6\u53ef\u4ee5\u8d70\u8be5\u4ee3\u7406\u901a\u9053\uff0c\u4ece\u800c\u65e0\u9700\u5efa\u7acbssh\u8fde\u63a5\uff1b\u540c\u65f6Launcher Pod\u4e2d\u8981\u6267\u884c\u7684\u547d\u4ee4\u4e5f\u662f\u901a\u8fc7\u8be5\u4ee3\u7406\u901a\u9053\u4e0b\u53d1\u5230\u5404\u4e2aWorker\u4e2d\u7684\u3002")),(0,o.yg)("li",{parentName:"ol"},(0,o.yg)("p",{parentName:"li"},"\u5728MPI Launcher Pod\u4e2d\u589e\u52a0",(0,o.yg)("inlineCode",{parentName:"p"},"OMPI_MCA_orte_default_hostfile=/etc/mpi/hostfile"),"\u73af\u5883\u53d8\u91cf\u6765\u8bbe\u7f6e\u9ed8\u8ba4hostfile\uff0c\u8fd9\u6837\u7528\u6237\u5728\u6267\u884cmpirun\u547d\u4ee4\u7684\u65f6\u5019\u5c31\u65e0\u9700\u624b\u52a8\u6307\u5b9ahostfile\u3002"))),(0,o.yg)("p",null,"\u6b64\u5916\u9488\u5bf9MPI\u542f\u52a8\u547d\u4ee4\uff0c\u5fc5\u987b\u589e\u52a0\u4e00\u4e9b\u5fc5\u5907\u9879\uff0c\u624d\u80fd\u4fdd\u969c\u6b63\u786e\u6267\u884c\uff0c\u5982\u4e0b\u6240\u793a\uff1a"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"mpirun --allow-run-as-root -bind-to none -map-by slot -mca pml ob1 -mca btl ^openib -mca plm_rsh_num_concurrent 300 -mca routed_radix 600 -mca plm_rsh_no_tree_spawn 1\n")),(0,o.yg)("p",null,"\u8be6\u7ec6\u89e3\u91ca\uff1a"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"mpirun \\\n --allow-run-as-root \\ ## \u6307\u7684\u662f\u5141\u8bb8root\u8eab\u4efd\u6267\u884c\u7a0b\u5e8f\uff0c\u9ed8\u8ba4\u662f\u4e0d\u5141\u8bb8\n -bind-to none -map-by slot \\ ## openMPI\u624d\u4f1a\u7528\u5230\uff0c\u6307\u7684\u662f\u4e0d\u81ea\u52a8\u7ed1\u5b9acpu\u6838\u5fc3\uff0c\u53ef\u4ee5\u4f7f\u7528\u8d85\u7ebf\u7a0b\uff0c\u5e76\u4e14\u6309\u7167\u8bbe\u7f6e\u7684slot\u8fdb\u884c\u6620\u5c04\uff0c\u6211\u4eec\u9ed8\u8ba4\u7684\u6bcf\u4e2a\u8282\u70b9\u7684slot=\u7528\u6237\u8bbe\u7f6e\u7684Pod GPU\u6570\u91cf \n -mca pml ob1 -mca btl ^openib \\ ## openMPI\u5efa\u7acb\u8fdb\u7a0bsocket\u7684\u5f3a\u5236\u4f7f\u7528ob1 PML\u65b9\u5f0f\uff0c\u5efa\u7acb\u8fde\u63a5\u4e0d\u4f7f\u7528IB\u7f51\n -mca plm_rsh_num_concurrent 300 \\ ## openMPI \u6307\u5b9a\u8981\u540c\u65f6\u8c03\u7528\u7684plm_rsh_agent\u5b9e\u4f8b\u6570\n -mca routed_radix 600 \\ ## \u89e3\u51b3\u9650\u5236Pod\u5e76\u53d1\u6570\u91cf\u95ee\u9898\uff0c\u5728\u8d77\u7684MPI \u5bb9\u5668\u5c11\u4e8e65\u4e2a\u65f6\u4e0d\u7528\u52a0\uff0c\u5927\u4e8e\u7b49\u4e8e65\u5c31\u9700\u8981\u52a0\u8fd9\u4e2a\uff0c\u5426\u5219\u4f1a\u88abopenMPI\u7ea6\u675f\n -mca plm_rsh_no_tree_spawn 1 \\ ## openMPI\u6307\u5b9a\u662f\u5426\u4f7f\u7528\u57fa\u4e8e\u6811\u7684\u62d3\u6251\u542f\u52a8\u5e94\u7528\u7a0b\u5e8f\uff0c1\u4e3a\u5141\u8bb8\n -np 1 \\ ## \u8868\u793a\u8bad\u7ec3\u4efb\u52a1\u4f7f\u7528\u7684\u8fdb\u7a0b\u6570\uff0c\u5176\u503c\u5c0f\u4e8e\u7b49\u4e8eGPU\u603b\u6570\uff0c\u59824\u673a8\u5361\uff0c\u5c31\u662f4*8=32\uff1b\u5f53np\u5c0f\u4e8eGPU\u603b\u6570\u65f6\u4f1a\u6709 (GPU\u603b\u6570-np) \u4e2aGPU\u65e0\u6cd5\u88ab\u4f7f\u7528\n")),(0,o.yg)("p",null,"\u5bf9\u4e8e\u4f7f\u7528RoCE\u7684\u7b97\u529b\u6c60\u7684\u8bad\u7ec3\u4efb\u52a1\uff0c\u53ef\u4ee5\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u589e\u52a0\u5982\u4e0b\u73af\u5883\u53d8\u91cf\u83b7\u5f97\u6700\u4f18\u6027\u80fd\uff0c \u8fd9\u4e2a\u73af\u5883\u53d8\u91cf\u9700\u8981\u5199\u5230\u7528\u6237\u5f97\u4efb\u52a1\u811a\u672c\u4e2d\u6216\u8005\u901a\u8fc7mpirun -x\u7684\u53c2\u6570\u9879\u5e26\u5230\u8fdb\u7a0b\u4e2d\uff1a"),(0,o.yg)("blockquote",null,(0,o.yg)("p",{parentName:"blockquote"},"\u6ce8\u610f\uff1a\u5bf9\u4e8e\u4f7f\u7528IB\u7c7b\u578b\u7684\u7b97\u529b\u6c60\u4efb\u52a1\uff0c\u53ef\u4ee5\u4e0d\u52a0",(0,o.yg)("inlineCode",{parentName:"p"},"NCCL_IB_TC"),"\u3001",(0,o.yg)("inlineCode",{parentName:"p"},"NCCL_IB_GID_INDEX"),"\u8fd9\u4e24\u4e2a\u73af\u5883\u53d8\u91cf\uff0c\u52a0\u4e86\u53ef\u80fd\u4f1a\u6bd4\u8f83\u6162")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"export NCCL_DEBUG=INFO # \u8fd9\u4e2a\u4e0e\u6027\u80fd\u65e0\u5173\uff0c\u53ea\u662f\u4fbf\u4e8e\u6392\u67e5\u95ee\u9898\nexport NCCL_IB_TC=106 # \u6307\u5b9aNCCL\u4f7f\u7528\u7684\u4ea4\u6362\u673a\u901a\u9053 \nexport NCCL_IB_GID_INDEX=3 # \u9009\u62e9\u6307\u5b9a\u7684IB index \nexport NCCL_SOCKET_IFNAME=eth0 # \u5728\u6784\u5efaNCCL socket\u65f6\u9009\u62e9eth0\u7f51\u7edc\nexport NCCL_CROSS_NIC=0 # \u56fa\u5b9a\u6bcf\u4e2a\u7f51\u5361\u7684\u8fde\u63a5\u901a\u9053\n\n#\u5f53\u8bad\u7ec3\u7684\u89c4\u6a21\u8fbe\u5230\u5343\u5361\u53ca\u4ee5\u4e0a\u65f6\uff0c\u53ef\u4ee5\u589e\u52a0\u5982\u4e0b\u73af\u5883\u53d8\u91cf\uff1a\nexport NCCL_ALGO=RING\n")),(0,o.yg)("h4",{id:"ui\u754c\u9762\u63d0\u4ea4mpi\u4efb\u52a1\u793a\u4f8b"},"UI\u754c\u9762\u63d0\u4ea4MPI\u4efb\u52a1\u793a\u4f8b"),(0,o.yg)("blockquote",null,(0,o.yg)("p",{parentName:"blockquote"},"\u6ce8\u610f\uff1a\u7b97\u529b\u6c60\u5f53\u524d\u7684MPI\u4efb\u52a1\u53ea\u652f\u6301openmpi\uff01\uff01\uff01\u539f\u56e0\u662f\u793e\u533a\u4e2d\u91c7\u7528\u9884\u914d\u7f6e\u8282\u70b9\u7684\u65b9\u5f0f\u4e3aopenmpi\u547d\u4ee4\u72ec\u6709\u7684\u53c2\u6570\uff0c\u5176\u4ed6MPI\u5b9e\u73b0\u53ef\u80fd\u4f1a\u6709\u6240\u5dee\u522b\uff0c\u6682\u65f6\u8fd8\u4e0d\u652f\u6301\u3002",(0,o.yg)("a",{parentName:"p",href:"https://www.open-MPI.org/"},"https://www.open-MPI.org/"))),(0,o.yg)("p",null,"\u5728a100_RoCE_1024\u7b97\u529b\u6c60\u3010RoCE\u7c7b\u578b\u7684\u7b97\u529b\u6c60\u3011\u4e0a\u542f\u52a8\u4e00\u4e2a128\u673a8\u5361\u5171\u8ba11024\u5361\u7684NCCL-test \u4efb\u52a1\u3002\n\u542f\u52a8\u547d\u4ee4\u5982\u4e0b\uff1a"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"mpirun --allow-run-as-root \\\n-bind-to none -map-by slot \\\n-mca pml ob1 -mca btl ^openib \\\n-mca plm_rsh_num_concurrent 300 \\\n-mca routed_radix 600 \\\n-mca plm_rsh_no_tree_spawn 1 \\\n-x NCCL_DEBUG=INFO \\\n-x NCCL_IB_GID_INDEX=3 \\\n-x NCCL_IB_TC=106 \\\n-x NCCL_CROSS_NIC=0 \\\n-x NCCL_ALGO=RING \\\n-x NCCL_SOCKET_IFNAME=eth0 \\\n-np 1024 \\\n/root/nccl_test/build/all_reduce_perf -b 4M -e 1024M -f 2 -g 1\n")),(0,o.yg)("p",null,(0,o.yg)("img",{src:r(98432).A,width:"845",height:"798"})),(0,o.yg)("p",null,(0,o.yg)("img",{src:r(49759).A,width:"1222",height:"896"})),(0,o.yg)("p",null,"\u4e0a\u9762\u662fRoCE\u7c7b\u578b\u7b97\u529b\u6c60MPI\u4efb\u52a1\u7684\u793a\u4f8b\uff0c\u5982\u679c\u662fIB\u7c7b\u578b\u7684\u7b97\u529b\u6c60\uff0c\u63d0\u4ea4\u7684\u6d41\u7a0b\u5b8c\u5168\u76f8\u540c\uff0c\u53ea\u9700\u8981\u53bb\u6389\u542f\u52a8\u547d\u4ee4\u4e2d\u7684"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"-x NCCL_IB_GID_INDEX=3\n-x NCCL_IB_TC=106\n")),(0,o.yg)("p",null,"\u8fd9\u4e24\u4e2a\u73af\u5883\u53d8\u91cf\u5373\u53ef\u3002"),(0,o.yg)("blockquote",null,(0,o.yg)("p",{parentName:"blockquote"},"\u6ce8\u610f\uff1a\u6211\u4eec\u5728\u65b0\u7248\u672c\u7684\u9ad8\u6027\u80fdAI\u7b97\u529b\u6c60\u4e2d\u7b80\u5316\u4e86\u4efb\u52a1\u63d0\u4ea4\u6d41\u7a0b\uff0c\u60a8\u53ea\u9700\u8981\u6307\u5b9a\u4e00\u4e2a\u89d2\u8272\u6570\u91cf\u548c\u89c4\u683c\u3002\u82e5\u60a8\u6307\u5b9a\u548cn\u4e2a\u89d2\u8272\u6570\u91cf\uff0c\u6211\u4eec\u4f1a\u81ea\u52a8\u4e3a\u60a8\u4ee51\u4e2aLauncher\u89d2\u8272\u548cn-1\u4e2aWorker\u89d2\u8272\u542f\u52a8\u4efb\u52a1\u3002")))}m.isMDXComponent=!0},98432:(e,n,r)=>{r.d(n,{A:()=>t});const t=r.p+"assets/images/acp_job3-8bf1135a32f9a03adbc443bac9d105f4.png"},49759:(e,n,r)=>{r.d(n,{A:()=>t});const t=r.p+"assets/images/acp_job4-6d2c3a4218cfa293e6c6ac0e99f32e1e.png"}}]);