|
找一份单词表,内容格式如下:

经过 PHP 代码处理,提取单词并保存到新的文件:
// Extract the leading English word from every line of word.txt, echo it, and
// append it (newline-terminated) to new_word.txt.
$file = fopen("word.txt", "r");
$new_file = fopen('new_word.txt', 'a');
if ($file === false || $new_file === false) {
    throw new \RuntimeException('Unable to open word.txt for reading or new_word.txt for appending');
}
$words = array();
$i = 0;
// Loop on the return value of fgets() instead of feof(): feof() only turns
// true after a read has already failed, which produced one extra pass with a
// false "line" at the end of the file.
while (($str = fgets($file)) !== false) {
    // A usable line starts with ASCII letters followed by whitespace. The
    // capture group can only contain letters, so no further cleanup is needed.
    if (preg_match('/^([a-zA-Z]+)\s+/', $str, $matches) === 1) {
        echo $matches[1] . "<br>";
        // Entries stay keyed by source line number and keep the trailing newline.
        $words[$i] = $matches[1] . "\n";
        fputs($new_file, $words[$i]);
    }
    $i++;
}
fclose($file);
fclose($new_file);

文件内容如下:

获取音频
// Download the pronunciation audio for every word listed in new_word.txt and
// store it as ./records/<word>.mp3.
if (!is_dir("./records") && !mkdir("./records")) {
    throw new \RuntimeException('Unable to create the ./records directory');
}
$file = fopen("new_word.txt", "r");
if ($file === false) {
    throw new \RuntimeException('Unable to open new_word.txt');
}
// Loop on the return value of fgets() so the end of file does not yield an
// extra pass with an empty word.
while (($str = fgets($file)) !== false) {
    echo $str;
    // Strip only the line terminator. Cutting a fixed one byte off the end
    // removed a real letter when the last line had no newline and left "\r"
    // behind for CRLF files.
    $str = rtrim($str, "\r\n");
    if ($str === '') {
        continue;
    }
    echo $str;
    $output = file_get_contents("http://dict.youdao.com/dictvoice?audio=" . rawurlencode($str) . "&type=2");
    if ($output === false) {
        // Skip failed downloads instead of writing an empty .mp3 file.
        continue;
    }
    file_put_contents("./records/$str" . ".mp3", $output);
}
fclose($file);

抓取界面
/**
 * Download the page at $word_url, save it as htmls/<word>.html next to this
 * file, and check that the saved page really describes $word.
 *
 * The check compares the text of the "#cigencizui-word" element with $word,
 * ignoring case. On success the method sleeps for one second and returns
 * nothing.
 *
 * @param string $word_url URL of the page to download
 * @param string $word     word the page is expected to describe
 * @return mixed the result of $this->error('出错') when the download fails or
 *               the page does not match; nothing otherwise
 */
function get_word_msg($word_url, $word)
{
    $path = __DIR__ . '/htmls/' . $word . ".html";

    $page = file_get_contents($word_url);
    if ($page === false) {
        // Do not leave an empty .html file behind: grab_word_act() treats an
        // existing file as a page that is ready to be analysed.
        return $this->error('出错');
    }
    file_put_contents($path, $page);

    $html = new simple_html_dom();
    // NOTE(review): simple_html_dom::load_file() appears to return false when
    // loading fails - confirm against the bundled library version.
    if ($html->load_file($path) === false) {
        return $this->error('出错');
    }

    // find() with an index yields null when nothing matches, so guard before
    // reading the property.
    $node = $html->find('#cigencizui-word', 0);
    $web_word = $node !== null ? $node->plaintext : null;
    // Release the DOM on every path; previously the error return skipped it.
    $html->clear();

    if ($web_word === null || strcasecmp($web_word, $word) != 0) {
        var_dump($web_word);
        var_dump($word);
        return $this->error('出错');
    }
    sleep(1);
}
/**
 * Look up $word on www.dicts.cn, derive the URL of its detail page from the
 * raw response, and hand that URL to get_word_msg().
 *
 * The response is requested with headers included; the detail path is taken
 * from the first occurrence of "dictword" in it. When the request fails or
 * the marker is absent, nothing is fetched.
 *
 * @param string $word word to look up
 * @return void
 */
function grab_word($word)
{
    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, "http://www.dicts.cn/dict/dict/dict!searchhtml3.asp?id=" . rawurlencode($word));
    curl_setopt($curl, CURLOPT_HEADER, 1);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    $data = curl_exec($curl);
    curl_close($curl);
    if ($data === false) {
        return;
    }

    $data = strstr($data, 'dictword');
    if ($data === false) {
        // Without the marker the old code fetched the site root and stored it
        // as the word's page.
        return;
    }
    // strstr() returns everything up to the end of the response; a URL cannot
    // contain a line break, so keep only the first line.
    $data = substr($data, 0, strcspn($data, "\r\n"));

    $real_url = "http://www.dicts.cn/" . $data;
    $this->get_word_msg($real_url, $word);
}
/**
 * Walk through new_word.txt and run analysis_word() on every word that
 * already has a saved page in htmls/, then insert whatever is still queued
 * in self::$words into the h_dict_word table.
 *
 * The htmls/ and images/ directories next to this file are created when
 * missing.
 *
 * @return void
 * @throws \RuntimeException when new_word.txt cannot be opened
 */
public function grab_word_act(){
    if (!is_dir(__DIR__ . "/htmls")){
        mkdir(__DIR__ . "/htmls");
    }
    if (!is_dir(__DIR__ . "/images")){
        mkdir(__DIR__ . "/images");
    }
    $file = fopen(__DIR__ . "/new_word.txt", "r");
    if ($file === false) {
        throw new \RuntimeException('Unable to open new_word.txt');
    }
    // Loop on the return value of fgets() so the end of file does not produce
    // an extra pass with an empty word.
    while (($str = fgets($file)) !== false) {
        // Strip only the line terminator; cutting a fixed one byte removed a
        // real letter when the last line had no trailing newline.
        $str = rtrim($str, "\r\n");
        if ($str !== '' && file_exists(__DIR__ . '/htmls/' . $str . ".html")) {
            $this->analysis_word($str);
        }
    }
    fclose($file);
    // Flush the remainder. Skip the call when nothing is queued, and reset the
    // queue so a second run cannot insert the same rows again.
    if (!empty(self::$words)) {
        Db::table('h_dict_word')->insertAll(self::$words);
        self::$words = array();
    }
}
/**
 * Parse the saved page htmls/<word>.html and queue one row for the
 * h_dict_word table in self::$words.
 *
 * The row holds the word itself (name), the inner HTML of the
 * "#cigencizui-word-pron>.en-UK" element (symbol), the inner HTML of the
 * "#cigencizui-word-info ul" element (mean), and the content that follows
 * each recognised section heading (source, story, remember, dictionary).
 * Pages whose "#cigencizui-word" text does not equal $word, ignoring case,
 * are skipped. Once 10 rows are queued they are inserted and the queue is
 * emptied.
 *
 * @param string $word word whose saved page should be parsed
 * @return void
 */
public function analysis_word($word)
{
    $html = new simple_html_dom();
    // NOTE(review): simple_html_dom::load_file() appears to return false when
    // loading fails - confirm against the bundled library version.
    if ($html->load_file(__DIR__ . '/htmls/' . $word . ".html") === false) {
        return;
    }

    // Validate the page first. find() with an index yields null when nothing
    // matches, so guard before reading the property.
    $word_node = $html->find('#cigencizui-word', 0);
    if ($word_node === null || strcasecmp($word_node->plaintext, $word) != 0) {
        // Release the DOM on this path too; it was previously skipped.
        $html->clear();
        return;
    }

    $yinbiao_node = $html->find('#cigencizui-word-pron>.en-UK', 0);
    $mean_node = $html->find('#cigencizui-word-info ul', 0);

    $data = $this->getEmptyArray(array('source', 'story', 'dictionary', 'symbol', 'mean', 'name','remember'));
    $data['name'] = $word;
    $data['symbol'] = $yinbiao_node !== null ? $yinbiao_node->innertext : null;
    $data['mean'] = $mean_node !== null ? $mean_node->innertext : null;

    $divs = $html->find('#cigencizui-content .page-header~div');
    if (!empty($divs)) {
        // Heading prefix => field that collects the blocks following that heading.
        $sections = array(
            '词源说明' => 'source',
            '21世纪大' => 'dictionary',
            "不拘一格背单词" => 'remember',
            "词源故事" => 'story',
        );
        $flag = "";
        foreach ($divs as $item) {
            $heading = null;
            foreach ($sections as $prefix => $field) {
                if (strpos($item->plaintext, $prefix) === 0) {
                    $heading = $field;
                    break;
                }
            }
            if ($heading !== null) {
                $flag = $heading;
                if ($flag === 'dictionary') {
                    // Keep the field a string; an array placeholder could
                    // reach insertAll() when no ".word" element was found.
                    $data['dictionary'] = '';
                }
                continue;
            }
            // Blocks under the dictionary heading are not appended here; that
            // field is filled from the ".word" elements below.
            if (in_array($flag, array('source', 'remember', 'story'), true)) {
                $data[$flag] .= $item->innertext;
            }
        }
        if (array_key_exists('dictionary', $data)) {
            $spans = $html->find('#cigencizui-content .word');
            if (!empty($spans)) {
                // Collect every ".word" element; plain assignment inside the
                // loop kept only the last one.
                $dictionary = '';
                foreach ($spans as $item) {
                    $dictionary .= $item->innertext;
                }
                $data['dictionary'] = $dictionary;
            }
        }
    }

    self::$words[] = $data;
    // Insert in batches of 10 rows.
    if (count(self::$words) >= 10) {
        Db::table('h_dict_word')->insertAll(self::$words);
        self::$words = array();
    }
    $html->clear();
}
|