整除分块十分naive,但是卡常之后就不清真了。
常数优化也是一门技术啊!
如果需要计算\[\sum_{i=1}^n\lfloor \frac{n}{i} \rfloor\]
有一个naive的做法就是
// Naive divisor-block enumeration (fragment: `n` and `ans` are declared by the surrounding code).
// Each pass handles one range [i, la] on which the quotient n/i is constant:
// la = n/(n/i) is the last index sharing i's quotient, so the range adds (n/i)*(la-i+1) to ans.
for (long long i=1,la; i<=n; i=la+1){la=n/(n/i);ans+=(n/i)*(la-i+1);
}
但是,这样不仅根号有2的常数,瓶颈上还有3次除法(可优化至2次),如果n是一个较大的数,跑起来很慢。
今天突然看到了松1自己的提交,于是兴冲冲地又复习了一下优越的算法。
首先推式子
要求\[\sum_{i=1}^n \sum_{j=1}^n [i*j \leq n] \]
可拆为\[ \sum_{i=1}^{ \lfloor \sqrt {n} \rfloor} \sum_{j=1}^n [i*j \leq n] +\sum_{i= \lfloor \sqrt{n} \rfloor +1}^n \sum_{j=1}^n [i*j \leq n] \]
变换边界条件\[ \sum_{i=1}^{ \lfloor \sqrt {n} \rfloor} \sum_{j=1}^n [i*j \leq n] +\sum_{i= \lfloor \sqrt{n} \rfloor +1}^n \sum_{j=1}^{\lfloor \sqrt{n} \rfloor} [i*j \leq n] \]
再把前后两项变得一样
\[ \sum_{i=1}^{ \lfloor \sqrt {n} \rfloor} \sum_{j=1}^n [i*j \leq n] +\sum_{i=1}^n \sum_{j=1}^{\lfloor \sqrt{n} \rfloor} [i*j \leq n]-\sum_{i=1}^{ \lfloor \sqrt {n} \rfloor} \sum_{j=1}^{\lfloor \sqrt {n} \rfloor}[i*j \leq n]\]
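合并一下(前两项由对称性相等;又因为 \(i,j \leq \lfloor \sqrt{n} \rfloor\) 时必有 \(i*j \leq n\),最后一项恰为 \( (\lfloor \sqrt{n} \rfloor)^2 \))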
\[2* \sum_{i=1}^{\lfloor \sqrt {n} \rfloor} \sum_{j=1}^n [i*j \leq n] - ( \lfloor \sqrt{n} \rfloor)^2 \]
换一种表示
\[2* \sum_{i=1}^{\lfloor \sqrt{n} \rfloor} \lfloor \frac{n}{i} \rfloor -(\lfloor \sqrt{n} \rfloor) ^2 \]
就可以快速计算啦!
%:pragma GCC target("avx")
%:pragma GCC optimize(3)
%:pragma GCC optimize("Ofast")
%:pragma GCC optimize("inline")
%:pragma GCC optimize("-fgcse")
%:pragma GCC optimize("-fgcse-lm")
%:pragma GCC optimize("-fipa-sra")
%:pragma GCC optimize("-ftree-pre")
%:pragma GCC optimize("-ftree-vrp")
%:pragma GCC optimize("-fpeephole2")
%:pragma GCC optimize("-ffast-math")
%:pragma GCC optimize("-fsched-spec")
%:pragma GCC optimize("unroll-loops")
%:pragma GCC optimize("-falign-jumps")
%:pragma GCC optimize("-falign-loops")
%:pragma GCC optimize("-falign-labels")
%:pragma GCC optimize("-fdevirtualize")
%:pragma GCC optimize("-fcaller-saves")
%:pragma GCC optimize("-fcrossjumping")
%:pragma GCC optimize("-fthread-jumps")
%:pragma GCC optimize("-funroll-loops")
%:pragma GCC optimize("-fwhole-program")
%:pragma GCC optimize("-freorder-blocks")
%:pragma GCC optimize("-fschedule-insns")
%:pragma GCC optimize("inline-functions")
%:pragma GCC optimize("-ftree-tail-merge")
%:pragma GCC optimize("-fschedule-insns2")
%:pragma GCC optimize("-fstrict-aliasing")
%:pragma GCC optimize("-fstrict-overflow")
%:pragma GCC optimize("-falign-functions")
%:pragma GCC optimize("-fcse-skip-blocks")
%:pragma GCC optimize("-fcse-follow-jumps")
%:pragma GCC optimize("-fsched-interblock")
%:pragma GCC optimize("-fpartial-inlining")
%:pragma GCC optimize("no-stack-protector")
%:pragma GCC optimize("-freorder-functions")
%:pragma GCC optimize("-findirect-inlining")
%:pragma GCC optimize("-frerun-cse-after-loop")
%:pragma GCC optimize("inline-small-functions")
%:pragma GCC optimize("-finline-small-functions")
%:pragma GCC optimize("-ftree-switch-conversion")
%:pragma GCC optimize("-foptimize-sibling-calls")
%:pragma GCC optimize("-fexpensive-optimizations")
%:pragma GCC optimize("-funsafe-loop-optimizations")
%:pragma GCC optimize("inline-functions-called-once")
%:pragma GCC optimize("-fdelete-null-pointer-checks")
#include <iostream>
#include <cmath>
using namespace std;
typedef unsigned long long ll;

// Returns floor(sqrt(n)) exactly.
// std::sqrt works in double precision, so for large n its result can land one
// above the true root (for example 10^16 - 1 converts to the double 1e16).
// The two loops below repair the estimate using overflow-free integer
// comparisons: r > n / r is equivalent to r * r > n, and
// r + 1 <= n / (r + 1) is equivalent to (r + 1) * (r + 1) <= n.
static ll floor_sqrt(ll n) {
    ll r = static_cast<ll>(std::sqrt(static_cast<double>(n)));
    while (r > 0 && r > n / r) --r;
    while (r + 1 <= n / (r + 1)) ++r;
    return r;
}

// Returns sum_{i=1..n} floor(n / i), computed as
// 2 * sum_{i=1..r} floor(n / i) - r * r with r = floor(sqrt(n)).
// The identity needs r to be the exact integer square root: if r * r > n the
// subtracted square over-counts and the result is off.
// Arithmetic is unsigned 64-bit, so results too large for it wrap modulo 2^64.
static ll divisor_summatory(ll n) {
    const ll root = floor_sqrt(n);
    ll half = 0;
    for (ll i = root; i; --i) half += n / i;
    return half * 2 - root * root;
}

// Reads n from standard input and prints the sum followed by a newline.
// n starts at 0 so that unreadable input prints 0.
int main() {
    ll n = 0;
    std::cin >> n;
    std::cout << divisor_summatory(n) << '\n';
}
还不够快?
利用\[ \lfloor \frac{n}{2i} \rfloor =\lfloor \frac{\lfloor \frac{n}{i} \rfloor}{2} \rfloor \]
可以优化
%:pragma GCC target("avx")
%:pragma GCC optimize(3)
%:pragma GCC optimize("Ofast")
%:pragma GCC optimize("inline")
%:pragma GCC optimize("-fgcse")
%:pragma GCC optimize("-fgcse-lm")
%:pragma GCC optimize("-fipa-sra")
%:pragma GCC optimize("-ftree-pre")
%:pragma GCC optimize("-ftree-vrp")
%:pragma GCC optimize("-fpeephole2")
%:pragma GCC optimize("-ffast-math")
%:pragma GCC optimize("-fsched-spec")
%:pragma GCC optimize("unroll-loops")
%:pragma GCC optimize("-falign-jumps")
%:pragma GCC optimize("-falign-loops")
%:pragma GCC optimize("-falign-labels")
%:pragma GCC optimize("-fdevirtualize")
%:pragma GCC optimize("-fcaller-saves")
%:pragma GCC optimize("-fcrossjumping")
%:pragma GCC optimize("-fthread-jumps")
%:pragma GCC optimize("-funroll-loops")
%:pragma GCC optimize("-fwhole-program")
%:pragma GCC optimize("-freorder-blocks")
%:pragma GCC optimize("-fschedule-insns")
%:pragma GCC optimize("inline-functions")
%:pragma GCC optimize("-ftree-tail-merge")
%:pragma GCC optimize("-fschedule-insns2")
%:pragma GCC optimize("-fstrict-aliasing")
%:pragma GCC optimize("-fstrict-overflow")
%:pragma GCC optimize("-falign-functions")
%:pragma GCC optimize("-fcse-skip-blocks")
%:pragma GCC optimize("-fcse-follow-jumps")
%:pragma GCC optimize("-fsched-interblock")
%:pragma GCC optimize("-fpartial-inlining")
%:pragma GCC optimize("no-stack-protector")
%:pragma GCC optimize("-freorder-functions")
%:pragma GCC optimize("-findirect-inlining")
%:pragma GCC optimize("-frerun-cse-after-loop")
%:pragma GCC optimize("inline-small-functions")
%:pragma GCC optimize("-finline-small-functions")
%:pragma GCC optimize("-ftree-switch-conversion")
%:pragma GCC optimize("-foptimize-sibling-calls")
%:pragma GCC optimize("-fexpensive-optimizations")
%:pragma GCC optimize("-funsafe-loop-optimizations")
%:pragma GCC optimize("inline-functions-called-once")
%:pragma GCC optimize("-fdelete-null-pointer-checks")
%:pragma GCC target("sse2,sse3,ssse3,sse4")
#include <iostream>
#include <cmath>
using namespace std;
typedef long long ll;

// Returns floor(sqrt(n)) exactly for n >= 1.
// std::sqrt works in double precision, so for large n its result can land one
// above the true root (for example 10^16 - 1 converts to the double 1e16).
// The loops repair the estimate with integer comparisons that cannot overflow:
// r > n / r is equivalent to r * r > n, and r + 1 <= n / (r + 1) is
// equivalent to (r + 1) * (r + 1) <= n.
static ll floor_sqrt(ll n) {
    ll r = static_cast<ll>(std::sqrt(static_cast<double>(n)));
    while (r > 0 && r > n / r) --r;
    while (r + 1 <= n / (r + 1)) ++r;
    return r;
}

// Returns sum_{i=1..n} floor(n / i), or 0 when n <= 0 (empty sum).
// Uses 2 * sum_{i=1..r} floor(n / i) - r * r with r = floor(sqrt(n)).
// Only odd i are divided; the terms for i*2, i*4, ... come from right-shifting
// the quotient, because floor(n / (2i)) == floor(floor(n / i) / 2).
// A chain stops once the quotient drops below cutoff = floor(n / r), which
// happens exactly when the shifted index exceeds r (this relies on r being the
// exact integer square root).
// Arithmetic is signed 64-bit; the caller must keep n small enough that the
// result fits.
static ll divisor_summatory(ll n) {
    if (n <= 0) return 0;  // also avoids dividing by root == 0
    const ll root = floor_sqrt(n);
    const ll cutoff = n / root;
    ll half = 0;
    for (ll odd = 1; odd <= root; odd += 2) {
        ll q = n / odd;
        while (q >= cutoff) {
            half += q;
            q >>= 1;
        }
    }
    return half * 2 - root * root;
}

// Reads n from standard input and prints the sum (no trailing newline).
// n starts at 0 so that unreadable input prints 0.
int main() {
    ll n = 0;
    std::cin >> n;
    std::cout << divisor_summatory(n);
}
还不够快,减少一次判断?
#pragma GCC target("avx")
#pragma GCC optimize(3)
#pragma GCC optimize("Ofast")
#pragma GCC optimize("inline")
#pragma GCC optimize("-fgcse")
#pragma GCC optimize("-fgcse-lm")
#pragma GCC optimize("-fipa-sra")
#pragma GCC optimize("-ftree-pre")
#pragma GCC optimize("-ftree-vrp")
#pragma GCC optimize("-fpeephole2")
#pragma GCC optimize("-ffast-math")
#pragma GCC optimize("-fsched-spec")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("-falign-jumps")
#pragma GCC optimize("-falign-loops")
#pragma GCC optimize("-falign-labels")
#pragma GCC optimize("-fdevirtualize")
#pragma GCC optimize("-fcaller-saves")
#pragma GCC optimize("-fcrossjumping")
#pragma GCC optimize("-fthread-jumps")
#pragma GCC optimize("-funroll-loops")
#pragma GCC optimize("-fwhole-program")
#pragma GCC optimize("-freorder-blocks")
#pragma GCC optimize("-fschedule-insns")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("-ftree-tail-merge")
#pragma GCC optimize("-fschedule-insns2")
#pragma GCC optimize("-fstrict-aliasing")
#pragma GCC optimize("-fstrict-overflow")
#pragma GCC optimize("-falign-functions")
#pragma GCC optimize("-fcse-skip-blocks")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-freorder-functions")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("-funsafe-loop-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")
#pragma GCC target("sse2,sse3,ssse3,sse4")
#include <iostream>
#include <cmath>
using namespace std;
typedef long long ll;

// Returns floor(sqrt(n)) exactly for n >= 1.
// std::sqrt works in double precision, so for large n its result can land one
// above the true root (for example 10^16 - 1 converts to the double 1e16).
// The loops repair the estimate with integer comparisons that cannot overflow:
// r > n / r is equivalent to r * r > n, and r + 1 <= n / (r + 1) is
// equivalent to (r + 1) * (r + 1) <= n.
static ll floor_sqrt(ll n) {
    ll r = static_cast<ll>(std::sqrt(static_cast<double>(n)));
    while (r > 0 && r > n / r) --r;
    while (r + 1 <= n / (r + 1)) ++r;
    return r;
}

// Returns sum_{i=1..n} floor(n / i), or 0 when n <= 0 (empty sum).
// Uses 2 * sum_{i=1..r} floor(n / i) - r * r with r = floor(sqrt(n)).
// Only odd i are divided; the terms for i*2, i*4, ... come from right-shifting
// the quotient, because floor(n / (2i)) == floor(floor(n / i) / 2).
// Arithmetic is signed 64-bit; the caller must keep n small enough that the
// result fits.
static ll divisor_summatory(ll n) {
    if (n <= 0) return 0;  // also avoids dividing by root == 0
    const ll root = floor_sqrt(n);
    const ll cutoff = n / root;
    ll half = 0;
    // For every odd <= root the first quotient is >= cutoff, so a do-while
    // adds it without the leading comparison a plain while would perform.
    // The next quotient is computed in the increment expression, before the
    // loop condition is re-tested.
    for (ll odd = 1, q = n; odd <= root; q = n / (odd += 2)) {
        do {
            half += q;
        } while ((q >>= 1) >= cutoff);
    }
    return half * 2 - root * root;
}

// Reads n from standard input and prints the sum (no trailing newline).
// n starts at 0 so that unreadable input prints 0.
int main() {
    ll n = 0;
    std::cin >> n;
    std::cout << divisor_summatory(n);
}
显然这还是没有到极致,不过我觉得已经挺快了。
1e16在机房的普通台式机上只需0.5s
还不够快?优化除法次数吧!
#pragma GCC target("avx")
#pragma GCC optimize(3)
#pragma GCC optimize("Ofast")
#pragma GCC optimize("inline")
#pragma GCC optimize("-fgcse")
#pragma GCC optimize("-fgcse-lm")
#pragma GCC optimize("-fipa-sra")
#pragma GCC optimize("-ftree-pre")
#pragma GCC optimize("-ftree-vrp")
#pragma GCC optimize("-fpeephole2")
#pragma GCC optimize("-ffast-math")
#pragma GCC optimize("-fsched-spec")
#pragma GCC optimize("unroll-loops")
#pragma GCC optimize("-falign-jumps")
#pragma GCC optimize("-falign-loops")
#pragma GCC optimize("-falign-labels")
#pragma GCC optimize("-fdevirtualize")
#pragma GCC optimize("-fcaller-saves")
#pragma GCC optimize("-fcrossjumping")
#pragma GCC optimize("-fthread-jumps")
#pragma GCC optimize("-funroll-loops")
#pragma GCC optimize("-fwhole-program")
#pragma GCC optimize("-freorder-blocks")
#pragma GCC optimize("-fschedule-insns")
#pragma GCC optimize("inline-functions")
#pragma GCC optimize("-ftree-tail-merge")
#pragma GCC optimize("-fschedule-insns2")
#pragma GCC optimize("-fstrict-aliasing")
#pragma GCC optimize("-fstrict-overflow")
#pragma GCC optimize("-falign-functions")
#pragma GCC optimize("-fcse-skip-blocks")
#pragma GCC optimize("-fcse-follow-jumps")
#pragma GCC optimize("-fsched-interblock")
#pragma GCC optimize("-fpartial-inlining")
#pragma GCC optimize("no-stack-protector")
#pragma GCC optimize("-freorder-functions")
#pragma GCC optimize("-findirect-inlining")
#pragma GCC optimize("-frerun-cse-after-loop")
#pragma GCC optimize("inline-small-functions")
#pragma GCC optimize("-finline-small-functions")
#pragma GCC optimize("-ftree-switch-conversion")
#pragma GCC optimize("-foptimize-sibling-calls")
#pragma GCC optimize("-fexpensive-optimizations")
#pragma GCC optimize("-funsafe-loop-optimizations")
#pragma GCC optimize("inline-functions-called-once")
#pragma GCC optimize("-fdelete-null-pointer-checks")
#pragma GCC target("sse2,sse3,ssse3,sse4")
#include <iostream>
#include <cmath>
using namespace std;
typedef long long ll;

// When two consecutive quotients differ by less than this, the next quotient
// is found by stepping down from the current one instead of dividing.
static const ll kNearGap = 13;

// Returns floor(sqrt(n)) exactly for n >= 1.
// std::sqrt works in double precision, so for large n its result can land one
// above the true root (for example 10^16 - 1 converts to the double 1e16).
// The loops repair the estimate with integer comparisons that cannot overflow:
// r > n / r is equivalent to r * r > n, and r + 1 <= n / (r + 1) is
// equivalent to (r + 1) * (r + 1) <= n.
static ll floor_sqrt(ll n) {
    ll r = static_cast<ll>(std::sqrt(static_cast<double>(n)));
    while (r > 0 && r > n / r) --r;
    while (r + 1 <= n / (r + 1)) ++r;
    return r;
}

// Returns sum_{i=1..n} floor(n / i), or 0 when n <= 0 (empty sum).
// Uses 2 * sum_{i=1..r} floor(n / i) - r * r with r = floor(sqrt(n)).
// Only odd i are visited; the terms for i*2, i*4, ... come from right-shifting
// the quotient, because floor(n / (2i)) == floor(floor(n / i) / 2).
// Arithmetic is signed 64-bit; the caller must keep n small enough that the
// result (and n + kNearGap) fits.
static ll divisor_summatory(ll n) {
    if (n <= 0) return 0;  // also avoids dividing by root == 0
    const ll root = floor_sqrt(n);
    const ll cutoff = n / root;
    ll half = 0;
    // q is floor(n / odd) and prev is the quotient of the previous odd index.
    // prev starts kNearGap above q so the first step always takes the
    // division branch.
    for (ll odd = 1, q = n, prev = n + kNearGap; odd <= root;) {
        ll chain = q;
        do {
            half += chain;
        } while ((chain >>= 1) >= cutoff);

        const bool close = prev - q < kNearGap;
        prev = q;
        odd += 2;
        if (close) {
            // Quotients are changing slowly: start one below the current
            // quotient and step down until odd * q <= n, updating the product
            // by subtraction instead of performing a division.
            ll prod = odd * (--q);
            while (prod > n) {
                --q;
                prod -= odd;
            }
        } else {
            q = n / odd;
        }
    }
    return half * 2 - root * root;
}

// Reads n from standard input and prints the sum (no trailing newline).
// n starts at 0 so that unreadable input prints 0.
int main() {
    ll n = 0;
    std::cin >> n;
    std::cout << divisor_summatory(n);
}
现在1e16只需0.38s左右
这东西貌似有一个\(\log\)做法,先咕着。