| 1 | %% Author: Caoyuan Deng, Anders Nygreni, Steve Vinoski |
|---|
| 2 | -module(tbray9a). |
|---|
| 3 | -compile([native, inline]). |
|---|
| 4 | -export([start/1]). |
|---|
| 5 | |
|---|
| 6 | -define(READ_SIZE, (1024 * 50000)). |
|---|
| 7 | -define(PAT, "/nehW/gniogno/ TEG\" ]"). % lists:reverse("] \"GET /ongoing/When/"). |
|---|
| 8 | -define(PAT_LEN, 21). % length("/nehW/gniogno/ TEG\" ]"). |
|---|
| 9 | -define(SKIP_LEN, 26). % length("] \"GET /ongoing/When/200x/"). |
|---|
| 10 | -define(KEY_OFFSET, 37). % length("] \"GET /ongoing/When/200x/2000/10/10/"). |
|---|
| 11 | |
|---|
| 12 | start([FileName]) -> |
|---|
| 13 | FileSize = filelib:file_size(FileName), |
|---|
| 14 | ProcN = FileSize div ?READ_SIZE, |
|---|
| 15 | ProcN1 = case FileSize rem ?READ_SIZE of 0 -> ProcN; _ -> ProcN + 1 end, |
|---|
| 16 | BmCtx = bm_compile(), |
|---|
| 17 | Results = scan_file(FileName, ?READ_SIZE, ProcN1, BmCtx), |
|---|
| 18 | Ets = ets:new(wfets, [set]), |
|---|
| 19 | Segs = lists:foldl(fun ({Keys, Head, Tail}, Segs) -> |
|---|
| 20 | lists:foreach(fun (Key) -> update_counter(Ets, Key) end, Keys), |
|---|
| 21 | [Head | [Tail | Segs]] |
|---|
| 22 | end, [], Results), |
|---|
| 23 | Segs1 = [Seg || {_, Seg} <- lists:keysort(1, Segs)], |
|---|
| 24 | SegKeys = scan_chunk([list_to_binary(Segs1), BmCtx]), |
|---|
| 25 | lists:foreach(fun (Key) -> update_counter(Ets, Key) end, SegKeys), |
|---|
| 26 | print_result(Ets), |
|---|
| 27 | halt(). |
|---|
| 28 | |
|---|
| 29 | update_counter(Ets, Key) -> |
|---|
| 30 | case catch ets:update_counter(Ets, Key, 1) of |
|---|
| 31 | {'EXIT', {badarg, _}} -> ets:insert(Ets, {Key, 1}); |
|---|
| 32 | _ -> ok |
|---|
| 33 | end. |
|---|
| 34 | |
|---|
| 35 | scan_file(FileName, Size, ProcN, BmCtx) -> |
|---|
| 36 | Workers = [spawn_worker(self(), fun pscan_file/1, [FileName, Size, I, BmCtx], []) |
|---|
| 37 | || I <- lists:seq(0, ProcN - 1)], |
|---|
| 38 | [wait_result(Worker) || Worker <- Workers]. |
|---|
| 39 | |
|---|
| 40 | pscan_file([FileName, Size, I, BmCtx]) -> |
|---|
| 41 | {ok, File} = file:open(FileName, [read, raw, binary]), |
|---|
| 42 | {ok, Bin} = file:pread(File, Size * I, Size), |
|---|
| 43 | file:close(File), |
|---|
| 44 | HeadL = split_on_first_newline(Bin, 0, 1), |
|---|
| 45 | TailS = split_on_first_newline(Bin, size(Bin) - 1, -1), |
|---|
| 46 | DataL = TailS - HeadL, |
|---|
| 47 | <<Head:HeadL/binary, Data:DataL/binary, Tail/binary>> = Bin, |
|---|
| 48 | {scan_chunk([Data, BmCtx]), {I * 10, Head}, {I * 10 + 1, Tail}}. |
|---|
| 49 | |
|---|
| 50 | split_on_first_newline(Bin, S, Direction) -> |
|---|
| 51 | case Bin of |
|---|
| 52 | <<_:S/binary,$\n,_/binary>> -> S + 1; |
|---|
| 53 | <<_:S/binary,_,_/binary>> -> split_on_first_newline(Bin, S + Direction, Direction); |
|---|
| 54 | _ -> S |
|---|
| 55 | end. |
|---|
| 56 | |
|---|
| 57 | scan_chunk([Bin, BmCtx]) -> scan_chunk_1(Bin, size(Bin), 0, [], BmCtx). |
|---|
| 58 | scan_chunk_1(Bin, DataL, S, Keys, BmCtx) when S < DataL - ?KEY_OFFSET -> |
|---|
| 59 | case bm_match(Bin, S + ?PAT_LEN, ?PAT, BmCtx, 1) of |
|---|
| 60 | {true, _} -> |
|---|
| 61 | case match_until_space_newline(Bin, S + ?KEY_OFFSET, size(Bin)) of |
|---|
| 62 | {true, E} -> |
|---|
| 63 | Skip = S + ?SKIP_LEN, L = E - Skip, |
|---|
| 64 | <<_:Skip/binary, Key:L/binary, _/binary>> = Bin, |
|---|
| 65 | scan_chunk_1(Bin, DataL, E, [Key | Keys], BmCtx); |
|---|
| 66 | {false, E} -> |
|---|
| 67 | scan_chunk_1(Bin, DataL, E, Keys, BmCtx) |
|---|
| 68 | end; |
|---|
| 69 | {false, Shift} -> |
|---|
| 70 | scan_chunk_1(Bin, DataL, S + Shift, Keys, BmCtx) |
|---|
| 71 | end; |
|---|
| 72 | scan_chunk_1(_, _, _, Keys, _) -> Keys. |
|---|
| 73 | |
|---|
| 74 | match_until_space_newline(Bin, S, DataL) when S < DataL -> |
|---|
| 75 | case Bin of |
|---|
| 76 | <<_:S/binary,10,_/binary>> -> {false, S}; |
|---|
| 77 | <<_:S/binary,$.,_/binary>> -> {false, S}; |
|---|
| 78 | <<_:S/binary,_,$ ,_/binary>> -> {true, S + 1}; |
|---|
| 79 | _ -> match_until_space_newline(Bin, S + 1, DataL) |
|---|
| 80 | end; |
|---|
| 81 | match_until_space_newline(_, S, _) -> {false, S}. |
|---|
| 82 | |
|---|
| 83 | print_result(Tab) -> |
|---|
| 84 | SortedList = lists:reverse(lists:keysort(2, ets:tab2list(Tab))), |
|---|
| 85 | [io:format("~b\t: ~s~n", [V, K]) || {K, V} <- lists:sublist(SortedList, 10)]. |
|---|
| 86 | |
|---|
| 87 | spawn_worker(Parent, F, Args, Opts) -> |
|---|
| 88 | erlang:spawn_opt(fun() -> Parent ! {self(), F(Args)} end, [monitor | Opts]). |
|---|
| 89 | |
|---|
| 90 | wait_result({Pid, Ref}) -> |
|---|
| 91 | receive |
|---|
| 92 | {'DOWN', Ref, _, _, normal} -> receive {Pid, Result} -> Result end; |
|---|
| 93 | {'DOWN', Ref, _, _, Reason} -> exit(Reason) |
|---|
| 94 | end. |
|---|
| 95 | |
|---|
| 96 | bm_compile() -> |
|---|
| 97 | Default = dict:from_list([{C, ?PAT_LEN} || C <- lists:seq(1, 255)]), |
|---|
| 98 | Dict = bm_set_shifts(lists:reverse(?PAT), ?PAT_LEN, 1, Default), |
|---|
| 99 | list_to_tuple([Pos || {_, Pos} <- lists:sort(dict:to_list(Dict))]). |
|---|
| 100 | |
|---|
| 101 | bm_set_shifts([C|T], StrLen, Pos, Dict) -> |
|---|
| 102 | bm_set_shifts(T, StrLen, Pos + 1, dict:store(C, StrLen - Pos, Dict)); |
|---|
| 103 | bm_set_shifts([], _, _, Dict) -> Dict. |
|---|
| 104 | |
|---|
| 105 | bm_match(Bin, S, [C|T], Tab, Count) -> |
|---|
| 106 | case Bin of |
|---|
| 107 | <<_:S/binary, C, _/binary>> -> |
|---|
| 108 | bm_match(Bin, S - 1, T, Tab, Count + 1); |
|---|
| 109 | <<_:S/binary, C1, _/binary>> -> |
|---|
| 110 | case element(C1, Tab) of |
|---|
| 111 | ?PAT_LEN -> {false, ?PAT_LEN}; |
|---|
| 112 | Shift when Shift < Count -> {false, 1}; |
|---|
| 113 | Shift -> {false, Shift - Count + 1} |
|---|
| 114 | end |
|---|
| 115 | end; |
|---|
| 116 | bm_match(_, _, [], _, _) -> {true, ?PAT_LEN}. |
|---|