使用邮箱注册
a. 安装python的kaggle模块,用于管理项目:
% pip install kaggle
DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.
Collecting kaggle
Downloading https://files.pythonhosted.org/packages/f4/de/4f22073f3afa618976ee0721b0deb72b5cde2782057e04a815a6828b53f9/kaggle-1.5.4.tar.gz (54kB)
100% |████████████████████████████████| 61kB 572kB/s
Collecting urllib3<1.25,>=1.21.1 (from kaggle)
Downloading https://files.pythonhosted.org/packages/01/11/525b02e4acc0c747de8b6ccdab376331597c569c42ea66ab0a1dbd36eca2/urllib3-1.24.3-py2.py3-none-any.whl (118kB)
100% |████████████████████████████████| 122kB 1.9MB/s
Requirement already satisfied: six>=1.10 in /Users/gongjing/usr/anaconda2/lib/python2.7/site-packages (from kaggle) (1.11.0)
Requirement already satisfied: certifi in /Users/gongjing/usr/anaconda2/lib/python2.7/site-packages (from kaggle) (2018.1.18)
Requirement already satisfied: python-dateutil in /Users/gongjing/usr/anaconda2/lib/python2.7/site-packages (from kaggle) (2.7.3)
Requirement already satisfied: requests in /Users/gongjing/usr/anaconda2/lib/python2.7/site-packages (from kaggle) (2.14.2)
Collecting tqdm (from kaggle)
Downloading https://files.pythonhosted.org/packages/9f/3d/7a6b68b631d2ab54975f3a4863f3c4e9b26445353264ef01f465dc9b0208/tqdm-4.32.2-py2.py3-none-any.whl (50kB)
100% |████████████████████████████████| 51kB 10.7MB/s
Collecting python-slugify (from kaggle)
Downloading https://files.pythonhosted.org/packages/c1/19/c3cf1dc65e89aa999f85a4a3a4924ccac765a6964b405d487b7b7c8bb39f/python-slugify-3.0.2.tar.gz
Collecting text-unidecode==1.2 (from python-slugify->kaggle)
Downloading https://files.pythonhosted.org/packages/79/42/d717cc2b4520fb09e45b344b1b0b4e81aa672001dd128c180fabc655c341/text_unidecode-1.2-py2.py3-none-any.whl (77kB)
100% |████████████████████████████████| 81kB 12.5MB/s
Building wheels for collected packages: kaggle, python-slugify
Building wheel for kaggle (setup.py) ... done
Stored in directory: /Users/gongjing/Library/Caches/pip/wheels/87/ea/09/173986e395d051411b9d547a69fe96cdc26208cb1bcc3e5567
Building wheel for python-slugify (setup.py) ... done
Stored in directory: /Users/gongjing/Library/Caches/pip/wheels/16/7f/c3/6b0582283ad589d68a306da924a78c74546e010d8106b9b3a9
Successfully built kaggle python-slugify
Installing collected packages: urllib3, tqdm, text-unidecode, python-slugify, kaggle
Successfully installed kaggle-1.5.4 python-slugify-3.0.2 text-unidecode-1.2 tqdm-4.32.2 urllib3-1.24.3
You are using pip version 19.0.3, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.
pip install kaggle 5.16s user 2.41s system 36% cpu 20.634 total
b. 在自己的账户下(https://www.kaggle.com/ 登录后进入账户页面)创建API token,下载得到kaggle.json,并放到~/.kaggle目录下:
gongjing@hekekedeiMac ~/.kaggle % pwd
/Users/gongjing/.kaggle
gongjing@hekekedeiMac ~/.kaggle % ls
kaggle.json
% kaggle competitions download -c titanic
kaggle competitions submit -c titanic -f submission.csv -m "Message"
company | id | age | salary | sex |
---|---|---|---|---|
A | 001 | 13 | 50000 | F |
A | 002 | 25 | 100000 | F |
B | 003 | 50 | 200000 | M |
C | 004 | 40 | 100000 | M |
B | 005 | 40 | 150000 | F |
# List the distinct ids.
# DISTINCT de-duplicates on the whole select list, not on a subset of the selected columns.
select distinct
    id
from table_1
# Count the distinct ids.
select
    count(distinct id)
from table_1
# Variant described in the original notes as the optimized form: de-duplicate in a
# subquery first, then count its rows.
# Note: count(*) counts a NULL id as one value, whereas count(distinct id) ignores NULLs.
select
    count(*)
from (
    select distinct
        id
    from table_1
) distinct_ids
# count(*): 包括所有列,相当于行数,不忽略值为NULL的
# count(1):与count(*)一样。
# count(列名):只包含列名所在列,统计时会忽略NULL
# count时需要看所在列是否可能存在空值NULL
# 例子
# Example table for the COUNT/AVG comparison.
# `id`, `sno` and `cno` are NOT NULL; `score` is the only column that may be NULL.
CREATE TABLE `score` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`sno` int(11) NOT NULL,
`cno` tinyint(4) NOT NULL,
`score` tinyint(4) DEFAULT NULL,
PRIMARY KEY (`id`)
) ;
A.SELECT sum(score) / count(*) FROM score WHERE cno = 2;
B.SELECT sum(score) / count(id) FROM score WHERE cno = 2;
C.SELECT sum(score) / count(sno) FROM score WHERE cno = 2;
D.SELECT sum(score) / count(score) FROM score WHERE cno = 2;
E.SELECT sum(score) / count(1) FROM score WHERE cno = 2;
F.SELECT avg(score) FROM score WHERE cno = 2;
# ABCE:sum(score)除以行数
# DF:sum(score)除以score不为NULL的行数
# avg(score):会忽略空值
# 聚合函数:基本的数据统计,例如计算最大值、最小值、平均值、总数、求和
# Count the distinct ids within each sex (F / M).
# sex is selected alongside the count so each result row can be attributed to its group.
select
    sex,
    count(distinct id) as id_count
from table_1
group by sex
# Maximum / minimum / average age over the whole table.
# No GROUP BY here: grouping by id would produce one row per id, so each
# "aggregate" would just be that single row's own age.
select
    max(age),
    min(age),
    avg(age)
from table_1
# Headcount of company A, split by sex.
# sex is selected alongside the count so each result row can be attributed to its group.
select
    sex,
    count(distinct id) as id_count
from table_1
where company = 'A'
group by sex
# Average age of male rows per company, keeping only companies whose average is above 30.
# WHERE filters rows before grouping; HAVING filters the groups after aggregation.
select
    company,
    avg(age)
from table_1
where sex = 'M'
group by company
having avg(age) > 30
# Ten oldest people: sort every row by age, highest first, and keep the first 10.
select
    id,
    age
from table_1
order by age desc
limit 10
# Bucket salary into income ranges.
# CASE expression format:
#   case when condition1 then value1 when condition2 then value2 ... else default end
# Branches are tested top to bottom and the first one that matches wins.
select
    id,
    (case
        when CAST(salary as float) < 50000 then '0-5万'
        when CAST(salary as float) >= 50000 and CAST(salary as float) < 100000 then '5-10万'
        when CAST(salary as float) >= 100000 and CAST(salary as float) < 200000 then '10-20万'
        when CAST(salary as float) >= 200000 then '20万以上'
        # reached only when none of the comparisons is true, e.g. salary is NULL
        else NULL
    end) as salary_range
from table_1
<窗口函数> over (partition by <用于分组的列名>
order by <用于排序的列名>)
原则上只能写在select子句中,是对where或者group by子句处理后的结果进行操作
# Concatenate the arguments into a single string.
select concat('www', 'iteblog', 'com') from iteblog
# Split the string on "," and return the pieces as an array.
select split("1,2,3", ",") as value_array from table_1
# Pick individual elements out of the split array by index.
select value_array[0],value_array[1],value_array[2] from (select split("1,2,3", ",") as value_array from table_1) t
# substr(str, pos, len): take len characters of str starting at position pos.
# As the example shows, pos is 1-based: position 3 of 'abcde' is 'c'.
select substr('abcde', 3, 2) from iteblog # returns 'cd'
# Number all rows by salary, highest first.
select *,row_number() over (order by salary desc) as row_num from table_1
# Number the rows inside each deptid partition by salary, highest first.
# NOTE(review): the alias `rank` is a reserved word in some engines (e.g. MySQL 8.0) and may need quoting or renaming there.
select *,row_number() over (partition by deptid order by salary desc) as rank from table_1
# rank(): tied rows share a number and the following numbers are skipped, e.g. 1, 1, 3.
# dense_rank(): tied rows share a number and no numbers are skipped, e.g. 1, 1, 2.
# row_number(): every row gets its own consecutive number, even when the sort values tie.
# Threshold for the top 10% of salary, i.e. its 90th percentile.
select percentile(CAST(salary as int), 0.9) as income_top10p_threshold from table_1
# The nine decile cut points of salary (10th through 90th percentile), requested in one call.
select percentile(CAST(salary as int), array(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9)) as income_top10p_thresholds from table_1
# Convert a date-time string to a date value.
select to_date('1970-01-01 00:00:00') as start_time from table_1
# Difference in days between two dates.
select datediff ('2016-12-30','2016-12-29') # returns 1
# datediff(enddate, startdate): number of days from startdate to enddate
# date_sub(startdate, interval 2 day): the date 2 days before startdate
# date_add(startdate, days): the date `days` days after startdate
重要的内置MySQL日期函数(链接):
函数 | 功能 |
---|---|
now() | 返回当前的日期和时间 |
curdate() | 返回当前的日期 |
curtime() | 返回当前的时间 |
date() | 提取日期,日期/时间表达式的日期部分 |
extract() | 返回日期/时间的单独部分 |
date_add() | 给日期添加指定的时间间隔 |
date_sub() | 从日期减去指定的时间间隔 |
datediff() | 返回两个日期之间的天数 |
date_format() | 用不同的格式显示日期/时间 |
day() | 取时间字段的天值 |
month() | 取时间字段的月值 |
year() | 取时间字段的年值 |
truncate:只删除表数据,保留表结构,不能加where
例:有3个表S,C,SC:
S(SNO,SNAME)代表(学号,姓名)
C(CNO,CNAME,CTEACHER)代表(课号,课名,教师)
SC(SNO,CNO,SCGRADE)代表(学号,课号,成绩)
问题:
1. 找出没选过“黎明”老师的所有学生姓名。
# Names of students who never took a course taught by '黎明'.
# NOT EXISTS is used instead of NOT IN because NOT IN returns no rows at all
# when its subquery yields a NULL; `=` replaces the non-standard `==`.
select s.sname
from s
where not exists (
    select 1
    from sc
    inner join c
        on c.cno = sc.cno
    where sc.sno = s.sno
      and c.cteacher = '黎明'
)
2. 列出2门以上(含2门)不及格学生姓名及平均成绩。
# Names and average grade of students with at least two failing grades (scgrade < 60).
select
    s.sname,
    avg_grade
from s
inner join (
    # students that have two or more failing rows in sc
    select sno
    from sc
    where scgrade < 60
    group by sno
    having count(*) >= 2
) failing
    on s.sno = failing.sno
inner join (
    # average over all of a student's grades, not only the failing ones
    select
        sno,
        avg(scgrade) as avg_grade
    from sc
    group by sno
) averages
    on s.sno = averages.sno;
3. 既学过1号课程又学过2号课所有学生的姓名。
# Names of students who took both course 1 and course 2.
# S must be joined in: SNAME is a column of S, while the two SC subqueries only expose SNO.
# DISTINCT keeps a student from appearing more than once if SC holds repeated (SNO, CNO) rows.
select s.sname
from s
inner join (select distinct sno from sc where cno = 1) a
    on a.sno = s.sno
inner join (select distinct sno from sc where cno = 2) b
    on b.sno = s.sno
# table A
Order_id User_id Add_time
11701245001 10000 1498882474
11701245002 10001 1498882475
# table B
id Order_id goods_id price
1 11701245001 1001 10
2 11701245001 1002 20
3 11701245002 1001 10
购买过goods_id 为1001的用户user_id:
A. select a.user_id from A a, B b where a.order_id = b.order_id and b.goods_id = '1001'
B. select user_id from A where order_id in (select order_id from B where goods_id = '1001')
C. select A.user_id from A left join B on A.order_id = B.order_id and B.goods_id = '1001'
# 使用SQL语句建个存储过程proc_stu,然后以student表中的学号Stu_ID为输入参数@s_no,返回学生个人的指定信息
-- Stored procedure stu.proc_student.
-- Parameter: @s_no (int), compared against student.Stu_ID.
-- Result: every column of the stu.student rows whose Stu_ID equals @s_no.
CREATE PROCEDURE [stu].[proc_student]
@s_no AS int
AS
BEGIN
select * from stu.student where Stu_ID=@s_no
END
-- Stored procedure stu.proc_student.
-- Parameter: @s_no (int), compared against student.Stu_ID.
-- Result: every column of the stu.student rows whose Stu_ID equals @s_no.
CREATE PROCEDURE [stu].[proc_student]
@s_no int
AS
BEGIN
select * from stu.student where Stu_ID=@s_no
END
select * from user_table where username='xxx' and password='xxx' or '1'='1':由于 or '1'='1' 恒为真,会查询到所有用户信息;这是注入的单引号导致SQL逻辑发生变化,达到恶意攻击的效果
select into:从一个表格中选取数据,然后把数据插入到另一个表中。通常用于创建表的备份或者对记录进行存档。
0.21.
及之后的版本采用,之前的版本没有。RBFSampler
, Nystroem
原始的数据集在UCI machine learning repository,包含10992个样本,sklearn这个版本是只取了其中的部分样本:
特征数目:64(8x8)
维基百科也有关于鸢(yuān)尾花卉数据集(Anderson’s Iris data set)的详细介绍:
用途:预测花卉属于哪一类
sklearn.datasets
,演示的函数包括:make_classification
,make_blobs
,make_gaussian_quantiles
sklearn.datasets
中的函数make_multilabel_classification
生成多标签的数据。模型:SVM高斯内核
data = digits.images.reshape((n_samples, -1))
,把原来的(1797, 8, 8)数据转换为(1797, 64)维,进行训练cluster.FeatureAgglomeration
###============================================================ [notebook]
C
(Penalty parameter C of the error term.)取不同的数值大小时,随对应的cross_val_score
。kernel
指定)比较效果。模型:线性模型LassoCV和Lasso
连续分布:
参数μ和σ决定了分布的均值和标准差。其CDF是S形的曲线。
术语:
Refer to Adobe Illustrator CC 2017 MAC中文破解版
command+7
快捷键,即完成剪切操作。object -> transform -> transform each
shift+option+cmd+d
select -> same -> fill & stroke
option+cmd+7
。object -> group
cmd+g
object -> ungroup
shift+cmd+g
V键
,切换到选择工具
,单击待填充对象2,然后按下I键
,切换到吸管工具
,鼠标变为一支吸管,在对象1单击,就可以将对象1的填充颜色复制填充到对象2上。V键
,切换到选择工具
,单击待填充对象2,单击工具箱中的轮廓
,将轮廓色
放在上方。按下I键
,切换到吸管工具
,鼠标变为一支吸管,按着Shift键
,在对象1单击吸取颜色,这样就可以将吸取的颜色放在对象2的轮廓上了。V键
,切换到选择工具
,单击待填充对象,然后按下I键
,切换到吸管工具
,鼠标变为一支吸管,按住鼠标左键不放,将光标移到AI界面之外的任意目标颜色处,释放鼠标,即可将吸取的颜色填充到需要改变颜色的图像上面。command+t
),设置文字:VA加下划线
的,即可调整间距。可参考这里 画
一个文本框,具体就是:选中文本按钮,在画布中拖出一个一定长宽的文本框,那么这个框里面的文字是会自动换行的。shift+cmd+D
shift+option+cmd+d
,但是不小心点错了artboard options
Document Setup
,有edit board
选项,也可设置对象
=> 形状
=> 转换为形状
option+→
:扩展字间距,option+←
:缩小字间距,option+↑
:缩小行距,option+↓
:扩展行距官方博客图片:
累积分布函数:
术语:
假设检验:
术语: