Помогите подправить код парсера на PHP

184

WhiteFalke

7 октября 2007, 11:03

908

Добрый день.

Есть парсер новостей. Ничего примечательного: указываем ленту — он её парсит и выводит.

Но есть две проблемы.

1) Проблема с кодировкой: как я понимаю, всё отображается в UTF-8.

2) Не хотелось бы, чтобы присутствовали внешние ссылки, то есть чтобы они выводились именно как гиперссылки. Хотелось бы, чтобы это был просто текст.

Скрипт состоит из двух файлов: первый — rss.php, второй — rss_fetch.php.

Первый:

<?php

// The page is served as windows-1251 (declared here), so everything echoed
// below has to end up in that encoding.
header("Content-Type: text/html; charset=windows-1251");

include_once "./rss_fetch.php";

// Per-item template; #{name} placeholders are filled in by rss_parser with
// the matching element of each feed item.
$html  = " <tr>\n";
$html .= " <td style='background-color: #F1F5F8; font-size: 13px;'>\n";
$html .= " <a href='#{link}' target='_new'>#{title}</a><br />\n";
$html .= " #{description}<br />\n";
$html .= " <font size=1>#{pubDate}</font><br /><br />\n"; // <font> was never closed
$html .= " </td>\n";
$html .= " </tr>\n";

// rss_parser echoes its output straight away, so capture it in a buffer in
// order to re-encode it before it reaches the browser.
ob_start();
$rss = new rss_parser("http://rssmix.ru/908", 10, $html, 1);
$feed_markup = ob_get_clean();

// NOTE(review): assumes the feed is UTF-8, as reported for this feed; verify
// before pointing the script at feeds in another encoding.
$converted = function_exists('iconv')
    ? iconv('UTF-8', 'Windows-1251//TRANSLIT', $feed_markup)
    : false;

// If the conversion is unavailable or fails, show the raw markup rather than nothing.
if ($converted === false) {
    $converted = $feed_markup;
}

?>
<table>
<?php echo $converted; ?>
</table>

Второй:

<?php

/************************************************************

RSS Fetch 0.4.3 (23 July 2005)

RSS Feed Reader

Author: Drew Phillips

www.neoprogrammers.com

Please rate this script at http://www.hotscripts.com/rate/48456.html

Then it will remain popular and others will be able to find it easier.

Thanks

This program is free software; you can redistribute it and/or

modify it under the terms of the GNU General Public License

as published by the Free Software Foundation; either version 2

of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,

but WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

GNU General Public License for more details.

You should have received a copy of the GNU General Public License

along with this program; if not, write to the Free Software

Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

************************************************************/

/**
 * Minimal RSS reader: downloads a feed over plain HTTP, caches the raw XML on
 * disk, extracts the <item> elements with regular expressions and echoes them
 * either as a simple link list or through a user supplied HTML template.
 *
 * Constructing an object performs the whole fetch / parse / print cycle.
 *
 * NOTE(review): values taken from the feed are echoed without HTML escaping
 * (item descriptions are expected to carry markup). Only use trusted feeds.
 */
class rss_parser {

    /* How often (in minutes) to fetch the rss file.
       A cached version will be used between updates; 0 means "always stale". */
    var $update_interval = 60;

    /* Where to store the rss data from the feeds.
       Note: an absolute path is better than a relative path here
       unless you plan on keeping the script to display the feeds
       in the same folder as this file and the feeds.
       Single quotes are required: inside double quotes the "\r" of "\rss"
       is turned into a carriage return and the path becomes invalid. */
    var $data_directory = 'C:\Program Files\VertrigoServ\www\rss';

    /* NO NEED TO EDIT BELOW HERE UNLESS YOU KNOW WHAT YOU ARE DOING */

    var $rss_url;           // feed URL as passed to the constructor
    var $num_to_show;       // maximum number of items to print
    var $offset;            // zero-based index of the first item to print (added in version 0.4.3)
    var $do_update;         // whether a stale cache may be refreshed from the network
    var $tags = array();
    var $content;           // raw feed XML
    var $rss = array();     // parsed items: list of (element name => text)
    var $feed_title;
    var $feed_link;
    var $feed_description;
    var $my_html;           // normalised item template ("" = built-in link list)

    /**
     * Fetches, parses and prints the feed.
     *
     * Declared as __construct() because PHP 8 no longer treats a method named
     * after the class as the constructor (the object would be created without
     * doing anything).
     *
     * @param string $url       Feed URL (http only).
     * @param int    $numtoshow Maximum number of items to print.
     * @param string $html      Item template with #{element} placeholders, or "" for the default list.
     * @param mixed  $update    Truthy to allow refreshing a stale cache file.
     * @param int    $offset    One-based number of the first item to print.
     */
    function __construct($url, $numtoshow = 10, $html = "", $update = FALSE, $offset = 1)
    {
        $this->rss_url     = $url;
        $this->num_to_show = $numtoshow;
        $this->do_update   = $update;
        $this->my_html     = $this->_normalize_template($html);
        // Callers count from 1; never let the index drop below 0.
        $this->offset      = max(0, $offset - 1);

        $this->content = $this->fetch_feed();
        $this->parse_feed();
        $this->show();
    }

    /**
     * PHP4-style constructor, kept so that PHP 4 and explicit
     * parent::rss_parser() calls keep working. Same parameters as __construct().
     */
    function rss_parser($url, $numtoshow = 10, $html = "", $update = FALSE, $offset = 1)
    {
        $this->__construct($url, $numtoshow, $html, $update, $offset);
    }

    /**
     * Rewrites ":" to "__" inside every #{...} placeholder so that namespaced
     * elements such as dc:date match the keys produced by parse_feed().
     * Text outside the placeholders is left untouched.
     *
     * @param string $html Raw template.
     * @return string Normalised template.
     */
    function _normalize_template($html)
    {
        $html = (string) $html;
        if (preg_match_all('/#\{[^}]+\}/', $html, $found)) {
            foreach (array_unique($found[0]) as $placeholder) {
                $html = str_replace($placeholder, str_replace(':', '__', $placeholder), $html);
            }
        }
        return $html;
    }

    /**
     * Returns the feed XML, from the cache file when it is usable and from the
     * network otherwise. A fresh download is written back to the cache.
     *
     * @return string Feed XML, or "" when nothing could be obtained.
     */
    function fetch_feed()
    {
        $url_parts = parse_url($this->rss_url);
        if (!is_array($url_parts) || !isset($url_parts['host'])) {
            $this->_report_fetch_failure();
            return '';
        }

        $host  = $url_parts['host'];
        $path  = isset($url_parts['path']) ? $url_parts['path'] : '';
        $query = isset($url_parts['query']) ? $url_parts['query'] : '';
        $port  = isset($url_parts['port']) ? (int) $url_parts['port'] : 80;

        // Cache file name: host, path with "/" replaced by ",", "_" and the query string.
        $cache_file = $this->data_directory . '/' . $host . str_replace('/', ',', $path) . '_' . $query;

        $is_cached = file_exists($cache_file);
        if ($is_cached) {
            $age      = time() - filemtime($cache_file);
            $is_stale = ($this->update_interval == 0 || $age > $this->update_interval * 60);
        } else {
            $is_stale = true;
        }

        // An existing cache is reused unless updates are enabled and it is stale.
        if ($is_cached && !($this->do_update == TRUE && $is_stale)) {
            return $this->_read_cache($cache_file);
        }

        $request_uri = ($path === '' ? '/' : $path) . ($query === '' ? '' : '?' . $query);
        $rss_data    = $this->_download($host, $port, $request_uri);
        if ($rss_data === false) {
            $this->_report_fetch_failure();
            // Prefer an outdated copy over showing nothing at all.
            return $is_cached ? $this->_read_cache($cache_file) : '';
        }

        // Best effort: an unwritable cache directory must not break the page,
        // hence the suppressed warning.
        $output = @fopen($cache_file, 'w+b');
        if ($output) {
            flock($output, LOCK_EX);
            fputs($output, $rss_data);
            flock($output, LOCK_UN);
            fclose($output);
        }

        return $rss_data;
    }

    /**
     * Reads a cache file.
     *
     * @param string $cache_file Path of the cache file.
     * @return string File contents, or "" when it cannot be read.
     */
    function _read_cache($cache_file)
    {
        $data = file_get_contents($cache_file);
        return ($data === false) ? '' : $data;
    }

    /**
     * Performs a plain HTTP/1.0 GET and returns the response body.
     *
     * @param string $host        Host name to connect to.
     * @param int    $port        TCP port.
     * @param string $request_uri Path plus optional query string.
     * @return string|false Body on a 2xx response, false on any failure.
     */
    function _download($host, $port, $request_uri)
    {
        // fsockopen() warns on failure; the caller prints its own message instead.
        $fp = @fsockopen($host, $port, $errno, $errstr, 5);
        if (!$fp) {
            return false;
        }

        $host_header = ($port == 80) ? $host : $host . ':' . $port;
        fputs($fp, "GET {$request_uri} HTTP/1.0\r\n"
            . "Host: {$host_header}\r\n"
            . "User-Agent: Drew's RSS Reader 0.1\r\n"
            . "Connection: Close\r\n\r\n");

        $response = '';
        while (!feof($fp)) {
            $chunk = fread($fp, 1024);
            // An empty or failed read before EOF means a timeout or error; stop
            // instead of spinning forever.
            if ($chunk === false || $chunk === '') {
                break;
            }
            $response .= $chunk;
        }
        fclose($fp);

        // Headers and body are separated by the first blank line.
        $parts = explode("\r\n\r\n", $response, 2);
        if (count($parts) < 2) {
            return false;
        }

        // Do not cache error pages or redirect stubs as if they were the feed.
        if (!preg_match('~^HTTP/\d+\.\d+\s+2\d\d\b~', $parts[0])) {
            return false;
        }

        return $parts[1];
    }

    /**
     * Prints the "could not open feed" message (URL and script name escaped).
     */
    function _report_fetch_failure()
    {
        $script = isset($_SERVER['PHP_SELF']) ? htmlspecialchars($_SERVER['PHP_SELF']) : '';
        echo "Couldn't open rss feed " . htmlspecialchars((string) $this->rss_url) . " in {$script}<br />\n";
    }

    /**
     * Extracts the feed title/link/description and every <item> from
     * $this->content into $this->rss. Element names containing ":" are stored
     * with "__" instead (dc:date becomes dc__date).
     *
     * @return void
     */
    function parse_feed()
    {
        $content   = (string) $this->content;
        $this->rss = array();

        preg_match("/<title>(.*?)<\/title>/", $content, $title);
        $this->feed_title = isset($title[1]) ? $title[1] : null;

        preg_match("/<link>(.*?)<\/link>/", $content, $link);
        $this->feed_link = isset($link[1]) ? $link[1] : null;

        preg_match("/<description>(.*?)<\/description>/", $content, $description);
        $this->feed_description = isset($description[1]) ? $description[1] : null;

        preg_match_all("/<item[^>]*>(.*?)<\/item>/s", $content, $items);

        if (sizeof($items[0]) == 0) {
            echo "No item elements found in rss feed.<br />\n";
        }

        for ($i = 0; $i < sizeof($items[0]); ++$i) {
            // Drop the opening <item> tag, then capture every child element as
            // (name, text), unwrapping an optional CDATA section.
            $item_xml = preg_replace("/<item[^>]*>/", "", $items[0][$i]);
            preg_match_all("/(?:<([\w:]*)[^>]*>\s*(?:<!\[CDATA\[)?(.*?)(?:]]>)?\s*<\/\\1>)+?/si", $item_xml, $elements);

            for ($j = 0; $j < sizeof($elements[0]); ++$j) {
                $name = str_replace(":", "__", $elements[1][$j]); // items with : like dc:date
                $this->rss[$i][$name] = trim($this->unhtmlentities($elements[2][$j]));
            }
        }
    }

    /**
     * Prints the items with the user template when one was given, otherwise
     * with the built-in link list.
     *
     * @return void
     */
    function show()
    {
        if ($this->my_html == "") {
            $this->show_html();
        } else {
            $this->show_user_html();
        }
    }

    /**
     * Number of items to print: at most num_to_show, never more than what is
     * left after the offset, never negative.
     *
     * @return int
     */
    function _visible_count()
    {
        $remaining = sizeof($this->rss) - $this->offset;
        return max(0, min($this->num_to_show, $remaining));
    }

    /**
     * Prints the default layout: one "- <a>title</a>" line per item.
     *
     * @return void
     */
    function show_html()
    {
        $last = $this->offset + $this->_visible_count();

        for ($i = $this->offset; $i < $last; ++$i) {
            $link  = isset($this->rss[$i]['link']) ? $this->rss[$i]['link'] : '';
            $title = isset($this->rss[$i]['title']) ? $this->rss[$i]['title'] : '';
            echo "- <a href=\"{$link}\" target=\"_new\">{$title}</a><br />\n";
        }
    }

    /**
     * Prints each item through the user template, replacing every #{name}
     * placeholder with the item's element of that name ("" when absent).
     *
     * Values are looked up directly in the item array: nothing is evaluated as
     * PHP code and feed element names cannot overwrite local variables.
     *
     * @return void
     */
    function show_user_html()
    {
        preg_match_all('/#\{([^}]+)\}/', $this->my_html, $found);
        $names = array_unique($found[1]);
        $last  = $this->offset + $this->_visible_count();

        for ($i = $this->offset; $i < $last; ++$i) {
            $pairs = array();
            foreach ($names as $name) {
                $pairs['#{' . $name . '}'] = isset($this->rss[$i][$name]) ? $this->rss[$i][$name] : '';
            }
            // strtr() replaces in a single pass, so placeholder-like text inside
            // feed values is not expanded again. Older PHP versions return false
            // for an empty pair list, hence the guard.
            echo $pairs ? strtr($this->my_html, $pairs) : $this->my_html;
        }
    }

    /**
     * Decodes numeric (&#65; / &#x41;) and named (&amp;, &quot;, ...) HTML
     * entities.
     *
     * Decoded characters are produced as UTF-8, so code points above 255 are
     * no longer truncated to a single byte.
     * NOTE(review): assumes the feed text itself is UTF-8; confirm for feeds
     * in other encodings.
     *
     * @param string $string Text possibly containing entities.
     * @return string Decoded text.
     */
    function unhtmlentities($string)
    {
        return html_entity_decode((string) $string, ENT_QUOTES, 'UTF-8');
    }

} // end class

?>

Возможно, не мне одному будет полезен этот скрипт в рабочем состоянии.

184

WhiteFalke

7 октября 2007, 12:07

#1

С ссылками разобрался. Осталась кодировка

K

103

kip

7 октября 2007, 12:15

#2

iconv("UTF-8","Windows-1251",$a); - вставляй это когда выводишь

Надежные и недорогие VPS/VDS (http://goo.gl/iifGKa) Бананы за выкуп (http://goo.gl/ZpRgC1)

184

WhiteFalke

7 октября 2007, 12:35

#3

kip:
iconv("UTF-8","Windows-1251",$a); - вставляй это когда выводишь

Большое человеческое спасибо.

184

WhiteFalke

7 октября 2007, 12:50

#4

Проблема со ссылками всё же актуальна: убрал ссылку из нижней подписи, но ссылки бывают и в самих новостях. Как избавиться от них?

K

295

Kpd

7 октября 2007, 13:13

#5

WhiteFalke, strip_tags()

S

56

solnikolay

7 октября 2007, 16:03

#6

// Ordered (pattern, replacement) rules; preg_replace() applies them one after
// another, each to the result of the previous one.
$rules = array(
    array("'<a[^>]*?>.*?</a>'si", ""),        // strips links together with their text
    array("'<[\/\!]*?[^<>]*?>'si", ""),       // strips the remaining HTML tags
    array("'([\r\n])[\s]+'", "\\1"),          // collapses whitespace after a line break
    array("'&(quot|#34);'i", "\""),           // from here on: replaces HTML entities
    array("'&(amp|#38);'i", "&"),
    array("'&(lt|#60);'i", "<"),
    array("'&(gt|#62);'i", ">"),
    array("'&(nbsp|#160);'i", " "),
    array("'&(iexcl|#161);'i", chr(161)),
    array("'&(cent|#162);'i", chr(162)),
    array("'&(pound|#163);'i", chr(163)),
    array("'&(copy|#169);'i", chr(169)),
);

// Split the rules into the two parallel lists preg_replace() expects.
$search  = array();
$replace = array();
foreach ($rules as $rule) {
    $search[]  = $rule[0];
    $replace[] = $rule[1];
}

$text = preg_replace($search, $replace, $document);

Роман Мандрик Вирусный маркетинг: Что ищут люди сегодня: ТИЦ: интересные изменения 13-01-07

Яндекс Вебмастер вынес товарные фиды в отдельный раздел

Зачем быть уникальным в мире, где все можно скопировать