我必须对第一个答案进行很多更改,我才开始这个!
USE test
DROP TABLE IF EXISTS ngram_key;
DROP TABLE IF EXISTS ngram_rec;
DROP TABLE IF EXISTS ngram_blk;
CREATE TABLE ngram_key
(
NGRAM_ID UNSIGNED BIGINT NOT NULL AUTO_INCREMENT,
NGRAM VARCHAR(64) NOT NULL,
PRIMARY KEY (NGRAM),
KEY (NGRAM_ID)
) ENGINE=MyISAM ROW_FORMAT=FIXED PARTITION BY KEY(NGRAM) PARTITIONS 256;
CREATE TABLE ngram_rec
(
NGRAM_ID UNSIGNED BIGINT NOT NULL,
YR SMALLINT NOT NULL,
MC SMALLINT NOT NULL,
PC SMALLINT NOT NULL,
VC SMALLINT NOT NULL,
PRIMARY KEY (NGRAM_ID,YR)
) ENGINE=MyISAM ROW_FORMAT=FIXED;
CREATE TABLE ngram_blk
(
NGRAM VARCHAR(64) NOT NULL,
YR SMALLINT NOT NULL,
MC SMALLINT NOT NULL,
PC SMALLINT NOT NULL,
VC SMALLINT NOT NULL
) ENGINE=BLACKHOLE;
DELIMITER $$
CREATE TRIGGER populate_ngram AFTER INSERT ON ngram_blk FOR EACH ROW
BEGIN
DECLARE NEW_ID BIGINT;
INSERT IGNORE INTO ngram_key (NGRAM) VALUES (NEW.NGRAM);
SELECT NGRAM_ID INTO NEW_ID FROM ngram_key WHERE NGRAM=NEW.NGRAM;
INSERT IGNORE INTO ngram_rec VALUES (NEW_ID,NEW.YR,NEW.MC,NEW.PC,NEW.VC);
END; $$
DELIMITER ;
INSERT INTO ngram_blk VALUES
('rolando',1965,31,29,85),
('pamela',1971,33,21,86),
('dominique',1996,30,18,87),
('diamond',1998,13,28,88),
('rolando edwards',1965,31,29,85),
('pamela edwards',1971,33,21,86),
('dominique edwards',1996,30,18,87),
('diamond edwards',1998,13,28,88),
('rolando angel edwards',1965,31,29,85),
('pamela claricia edwards',1971,33,21,86),
('dominique sharlisee edwards',1996,30,18,87),
('diamond ashley edwards',1998,13,28,88);
UPDATE ngram_rec SET yr=yr+1,mc=mc+30,pc=pc+30,vc=vc+30;
INSERT INTO ngram_blk VALUES
('rolando',1965,31,29,85),
('pamela',1971,33,21,86),
('dominique',1996,30,18,87),
('diamond',1998,13,28,88),
('rolando edwards',1965,31,29,85),
('pamela edwards',1971,33,21,86),
('dominique edwards',1996,30,18,87),
('diamond edwards',1998,13,28,88),
('rolando angel edwards',1965,31,29,85),
('pamela claricia edwards',1971,33,21,86),
('dominique sharlisee edwards',1996,30,18,87),
('diamond ashley edwards',1998,13,28,88);
UPDATE ngram_rec SET yr=yr+1,mc=mc+30,pc=pc+30;
INSERT INTO ngram_blk VALUES
('rolando',1965,31,29,85),
('pamela',1971,33,21,86),
('dominique',1996,30,18,87),
('diamond',1998,13,28,88),
('rolando edwards',1965,31,29,85),
('pamela edwards',1971,33,21,86),
('dominique edwards',1996,30,18,87),
('diamond edwards',1998,13,28,88),
('rolando angel edwards',1965,31,29,85),
('pamela claricia edwards',1971,33,21,86),
('dominique sharlisee edwards',1996,30,18,87),
('diamond ashley edwards',1998,13,28,88);
UPDATE ngram_rec SET yr=yr+1,mc=mc+30;
SELECT * FROM ngram_key;
SELECT * FROM ngram_rec;
SELECT A.ngram NGram,B.yr Year,B.mc Matches,B.pc Pages,B.vc Volumes FROM
ngram_key A,ngram_rec B
WHERE A.ngram='rolando angel edwards'
AND A.ngram_id=B.ngram_id;
用于年份信息的表要小得多,但是保留原始ngram的键要大得多。我还增加了测试数据量。您可以将其直接剪切并粘贴到MySQL中。
警告
只需删除ROW_FORMAT,它就会变得非常动态,并将ngram_key表压缩得更小。
DiskSpace指标
nrgram_rec每行有17个字节ngram_id有
8个字节(最大无符号值18446744073709551615 [2 ^ 64-1])
8个字节有4个smallint(每个2个字节)
1个字节MyISAM内部删除标志
ngram_rec的索引条目= 10字节(8(ngram_id)+ 2(yr))
4,700万行X每行17个字节= 0799百万字节= 761.98577 MB
4,700万行X每行12个字节= 0564百万字节= 537.85231 MB
4,700万行X每行29个字节= 13.63亿字节= 1.269393 GB
50亿行X每行17个字节= 8850亿字节= 079.1624 GB
50亿行X每行12个字节= 6600亿字节= 055.8793 GB
50亿行X每行29个字节= 1450亿字节= 135.0417 GB
ngram_key有73个字节,ngram有64个字节(ROW_FORMAT = FIXED将varchar设置为char)ngram_id有8个字节1个字节MyISAM内部删除标志
ngram_key的2个索引条目= 64字节+ 8字节= 72字节
4,700万行X每行073字节= 3431百万字节= 3.1954 GB
4700万行X每072字节= 33.84亿字节= 3.1515 GB
4700万行X 145字节每行= 681.5亿字节= 6.3469 GB
50亿行X每行073字节= 3650亿字节= 339.9327 GB
50亿行X每行072字节= 3600亿字节= 335.2761 GB
50亿行X每行145字节= 7250亿字节= 675.2088 GB