"use strict";(self.webpackChunkelementary_public_docs=self.webpackChunkelementary_public_docs||[]).push([[14792],{15680:(e,t,r)=>{r.d(t,{xA:()=>u,yg:()=>m});var n=r(96540);function o(e,t,r){return t in e?Object.defineProperty(e,t,{value:r,enumerable:!0,configurable:!0,writable:!0}):e[t]=r,e}function c(e,t){var r=Object.keys(e);if(Object.getOwnPropertySymbols){var n=Object.getOwnPropertySymbols(e);t&&(n=n.filter((function(t){return Object.getOwnPropertyDescriptor(e,t).enumerable}))),r.push.apply(r,n)}return r}function a(e){for(var t=1;t=0||(o[r]=e[r]);return o}(e,t);if(Object.getOwnPropertySymbols){var c=Object.getOwnPropertySymbols(e);for(n=0;n=0||Object.prototype.propertyIsEnumerable.call(e,r)&&(o[r]=e[r])}return o}var i=n.createContext({}),l=function(e){var t=n.useContext(i),r=t;return e&&(r="function"==typeof e?e(t):a(a({},t),e)),r},u=function(e){var t=l(e.components);return n.createElement(i.Provider,{value:t},e.children)},s="mdxType",d={inlineCode:"code",wrapper:function(e){var t=e.children;return n.createElement(n.Fragment,{},t)}},y=n.forwardRef((function(e,t){var r=e.components,o=e.mdxType,c=e.originalType,i=e.parentName,u=p(e,["components","mdxType","originalType","parentName"]),s=l(r),y=o,m=s["".concat(i,".").concat(y)]||s[y]||d[y]||c;return r?n.createElement(m,a(a({ref:t},u),{},{components:r})):n.createElement(m,a({ref:t},u))}));function m(e,t){var r=arguments,o=t&&t.mdxType;if("string"==typeof e||o){var c=r.length,a=new Array(c);a[0]=y;var p={};for(var i in t)hasOwnProperty.call(t,i)&&(p[i]=t[i]);p.originalType=e,p[s]="string"==typeof e?e:o,a[1]=p;for(var l=2;l{r.r(t),r.d(t,{assets:()=>i,contentTitle:()=>a,default:()=>d,frontMatter:()=>c,metadata:()=>p,toc:()=>l});var n=r(58168),o=(r(96540),r(15680));const c={sidebar_position:2,product:"\u9ad8\u6027\u80fdAI\u7b97\u529b\u6c60(ACP)"},a="\u63d0\u4ea4\u4e00\u4e2aPytorch DDP\u5206\u5e03\u5f0f\u4efb\u52a1",p={unversionedId:"cloud-foundation/compute/acp/acpBestPractices/Job-PyTorchDDP",id:"cloud-foundation/compute/acp/acpBestPractices/Job-PyTorchDDP",title:"\u63d0\u4ea4\u4e00\u4e2aPytorch DDP\u5206\u5e03\u5f0f\u4efb\u52a1",description:"\u5bf9\u4e8ePytorch DDP\u5206\u5e03\u5f0f\u4efb\u52a1\uff0c\u6211\u4eec\u4f1a\u521b\u5efa\u4e24\u79cd\u7c7b\u578b\u7684Pod\uff1aMaster\u548cWorker\uff0c\u5176\u4e2dMaster\u4e3aWorker\u63d0\u4f9b\u4e86\u53ef\u4ee5\u8bbf\u95ee\u7684Master IP\u5730\u5740\u3001Master \u670d\u52a1\u7aef\u53e3\uff0c\u5e76\u4e14\u81ea\u52a8\u628aPytorch\u4efb\u52a1\u4e2d\u5168\u90e8\u7684\u8fdb\u7a0b\u6570\u91cf\u548c\u6bcf\u4e2aPod\u7684\u8fdb\u7a0bID\u90fd\u8f93\u51fa\u5230\u4e86\u73af\u5883\u53d8\u91cf\u4e2d\uff0c\u6240\u4ee5\u7528\u6237\u5728\u4f7f\u7528Pytorch\u5206\u5e03\u5f0f\u8bad\u7ec3\u4efb\u52a1\u7684\u65f6\u5019\uff0c\u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528\u5982\u4e0b\u73af\u5883\u53d8\u91cf\u540d\u79f0\u66ff\u6362\u8bad\u7ec3\u811a\u672c\u4e2d\u7684Master ip\u3001Master\u7aef\u53e3\u3001\u8fdb\u7a0b\u603b\u6570\u548c\u8fdb\u7a0brank\u3002",source:"@site/docs/cloud-foundation/compute/acp/acpBestPractices/Job-PyTorchDDP.md",sourceDirName:"cloud-foundation/compute/acp/acpBestPractices",slug:"/cloud-foundation/compute/acp/acpBestPractices/Job-PyTorchDDP",permalink:"/help/docs/cloud-foundation/compute/acp/acpBestPractices/Job-PyTorchDDP",draft:!1,editUrl:"https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/docs/cloud-foundation/compute/acp/acpBestPractices/Job-PyTorchDDP.md",tags:[],version:"current",sidebarPosition:2,frontMatter:{sidebar_position:2,product:"\u9ad8\u6027\u80fdAI\u7b97\u529b\u6c60(ACP)"},sidebar:"tutorialSidebar",previous:{title:"MindSpore InternLM2-20B 32\u5361 \u5fae\u8c03\u6700\u4f73\u5b9e\u8df5",permalink:"/help/docs/cloud-foundation/compute/acp/acpBestPractices/Job-Mindspore-InternLM"},next:{title:"\u3010\u5feb\u901f\u5f00\u59cb\u3011\u5fae\u8c03Llama-3-8B-Instruct\u6a21\u578b\u6700\u4f73\u5b9e\u8df5",permalink:"/help/docs/cloud-foundation/compute/acp/acpBestPractices/Job-QuickStart-Llama3-8B"}},i={},l=[{value:"UI\u754c\u9762\u63d0\u4ea4DDP\u4efb\u52a1\u793a\u4f8b",id:"ui\u754c\u9762\u63d0\u4ea4ddp\u4efb\u52a1\u793a\u4f8b",level:4}],u={toc:l},s="wrapper";function d(e){let{components:t,...c}=e;return(0,o.yg)(s,(0,n.A)({},u,c,{components:t,mdxType:"MDXLayout"}),(0,o.yg)("h1",{id:"\u63d0\u4ea4\u4e00\u4e2apytorch-ddp\u5206\u5e03\u5f0f\u4efb\u52a1"},"\u63d0\u4ea4\u4e00\u4e2aPytorch DDP\u5206\u5e03\u5f0f\u4efb\u52a1"),(0,o.yg)("p",null,"\u5bf9\u4e8ePytorch DDP\u5206\u5e03\u5f0f\u4efb\u52a1\uff0c\u6211\u4eec\u4f1a\u521b\u5efa\u4e24\u79cd\u7c7b\u578b\u7684Pod\uff1aMaster\u548cWorker\uff0c\u5176\u4e2dMaster\u4e3aWorker\u63d0\u4f9b\u4e86\u53ef\u4ee5\u8bbf\u95ee\u7684Master IP\u5730\u5740\u3001Master \u670d\u52a1\u7aef\u53e3\uff0c\u5e76\u4e14\u81ea\u52a8\u628aPytorch\u4efb\u52a1\u4e2d\u5168\u90e8\u7684\u8fdb\u7a0b\u6570\u91cf\u548c\u6bcf\u4e2aPod\u7684\u8fdb\u7a0bID\u90fd\u8f93\u51fa\u5230\u4e86\u73af\u5883\u53d8\u91cf\u4e2d\uff0c\u6240\u4ee5\u7528\u6237\u5728\u4f7f\u7528Pytorch\u5206\u5e03\u5f0f\u8bad\u7ec3\u4efb\u52a1\u7684\u65f6\u5019\uff0c\u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528\u5982\u4e0b\u73af\u5883\u53d8\u91cf\u540d\u79f0\u66ff\u6362\u8bad\u7ec3\u811a\u672c\u4e2d\u7684Master ip\u3001Master\u7aef\u53e3\u3001\u8fdb\u7a0b\u603b\u6570\u548c\u8fdb\u7a0brank\u3002"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"$MASTER_ADDR \n$MASTER_PORT \n$WORLD_SIZE \n$RANK\n")),(0,o.yg)("p",null,"\u4e3e\u4f8b\u5982\u4e0b\uff1a"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"python -m distributed.launch --nproc-per-node 2 --nnodes $WORLD_SIZE --node_rank $RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT mnist.py --backend nccl\n")),(0,o.yg)("p",null,"\u5bf9\u4e8e\u4f7f\u7528RoCE\u7684\u7b97\u529b\u6c60\u7684\u8bad\u7ec3\u4efb\u52a1\uff0c\u53ef\u4ee5\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u589e\u52a0\u5982\u4e0b\u73af\u5883\u53d8\u91cf\u83b7\u5f97\u6700\u4f18\u6027\u80fd\uff1a"),(0,o.yg)("blockquote",null,(0,o.yg)("p",{parentName:"blockquote"},"\u6ce8\u610f\uff1a\u5bf9\u4e8e\u4f7f\u7528IB\u7c7b\u578b\u7684\u7b97\u529b\u6c60\u4efb\u52a1\uff0c\u53ef\u4ee5\u4e0d\u52a0",(0,o.yg)("inlineCode",{parentName:"p"},"NCCL_IB_TC"),"\u3001",(0,o.yg)("inlineCode",{parentName:"p"},"NCCL_IB_GID_INDEX"),"\u8fd9\u4e24\u4e2a\u73af\u5883\u53d8\u91cf\uff0c\u52a0\u4e86\u53ef\u80fd\u4f1a\u6bd4\u8f83\u6162")),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"export NCCL_DEBUG=INFO # \u8fd9\u4e2a\u4e0e\u6027\u80fd\u65e0\u5173\uff0c\u53ea\u662f\u4fbf\u4e8e\u6392\u67e5\u95ee\u9898\nexport NCCL_IB_TC=106 # \u6307\u5b9aNCCL\u4f7f\u7528\u7684\u4ea4\u6362\u673a\u901a\u9053 \nexport NCCL_IB_GID_INDEX=3 # \u9009\u62e9\u6307\u5b9a\u7684IB index \nexport NCCL_SOCKET_IFNAME=eth0 # \u5728\u6784\u5efaNCCL socket\u65f6\u9009\u62e9eth0\u7f51\u7edc\nexport NCCL_CROSS_NIC=0 # \u56fa\u5b9a\u6bcf\u4e2a\u7f51\u5361\u7684\u8fde\u63a5\u901a\u9053\n\n#\u5f53\u8bad\u7ec3\u7684\u89c4\u6a21\u8fbe\u5230\u5343\u5361\u53ca\u4ee5\u4e0a\u65f6\uff0c\u53ef\u4ee5\u589e\u52a0\u5982\u4e0b\u73af\u5883\u53d8\u91cf\uff1a\nexport NCCL_ALGO=TREE\n")),(0,o.yg)("h4",{id:"ui\u754c\u9762\u63d0\u4ea4ddp\u4efb\u52a1\u793a\u4f8b"},"UI\u754c\u9762\u63d0\u4ea4DDP\u4efb\u52a1\u793a\u4f8b"),(0,o.yg)("p",null,"\u5728a100_RoCE_1024\u7b97\u529b\u6c60\u3010RoCE\u7c7b\u578b\u7684\u7b97\u529b\u6c60\u3011\u4e0a\u542f\u52a8\u4e00\u4e2a128\u673a8\u5361\u5171\u8ba11024\u5361\u7684DDP Pytorch \u8bad\u7ec3\u4efb\u52a1\u3002"),(0,o.yg)("p",null,"\u542f\u52a8\u547d\u4ee4\u5982\u4e0b\uff1a"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"export NCCL_DEBUG=INFO\nexport NCCL_IB_GID_INDEX=3\nexport NCCL_IB_TC=106\nexport NCCL_CROSS_NIC=0\nexport NCCL_ALGO=RING\nexport NCCL_SOCKET_IFNAME=eth0\nbash ./train/176B_ft_qb_60w_en_80w_20230213.sh $MASTER_ADDR $MASTER_PORT $WORLD_SIZE $RANK\n")),(0,o.yg)("p",null,(0,o.yg)("img",{src:r(27410).A,width:"897",height:"896"})),(0,o.yg)("p",null,(0,o.yg)("img",{src:r(92905).A,width:"1182",height:"848"})),(0,o.yg)("p",null,"\u4e0a\u9762\u662fRoCE\u7c7b\u578b\u7b97\u529b\u6c60\u4efb\u52a1\u7684\u793a\u4f8b\uff0c\u5982\u679c\u662fIB\u7c7b\u578b\u7684\u7b97\u529b\u6c60\uff0c\u63d0\u4ea4\u7684\u6d41\u7a0b\u5b8c\u5168\u76f8\u540c\uff0c\u53ea\u9700\u8981\u53bb\u6389\u542f\u52a8\u547d\u4ee4\u4e2d\u7684"),(0,o.yg)("pre",null,(0,o.yg)("code",{parentName:"pre"},"export NCCL_IB_GID_INDEX=3\nexport NCCL_IB_TC=106\nexport NCCL_CROSS_NIC=0\n")),(0,o.yg)("p",null,"\u8fd93\u4e2a\u73af\u5883\u53d8\u91cf\u5373\u53ef\u3002"),(0,o.yg)("blockquote",null,(0,o.yg)("p",{parentName:"blockquote"},"\u6ce8\u610f\uff1a\u6211\u4eec\u5728\u65b0\u7248\u672c\u7684\u9ad8\u6027\u80fdAI\u7b97\u529b\u6c60\u4e2d\u7b80\u5316\u4e86\u4efb\u52a1\u63d0\u4ea4\u6d41\u7a0b\uff0c\u60a8\u53ea\u9700\u8981\u6307\u5b9a\u4e00\u4e2a\u89d2\u8272\u6570\u91cf\u548c\u89c4\u683c\u3002\u82e5\u60a8\u6307\u5b9a\u548cn\u4e2a\u89d2\u8272\u6570\u91cf\uff0c\u6211\u4eec\u4f1a\u81ea\u52a8\u4e3a\u60a8\u4ee51\u4e2aMaster\u89d2\u8272\u548cn-1\u4e2aWorker\u89d2\u8272\u542f\u52a8\u4efb\u52a1\u3002")))}d.isMDXComponent=!0},27410:(e,t,r)=>{r.d(t,{A:()=>n});const n=r.p+"assets/images/acp_job1-6086dec744e3ae2e8a433c744d876e68.PNG"},92905:(e,t,r)=>{r.d(t,{A:()=>n});const n=r.p+"assets/images/acp_job2-2d2f54dbfc3f9ebfe7b7aaee025fea05.png"}}]);